PaddlePaddle / Paddle — Commit cf0511f2

Authored Feb 25, 2019 by Qiao Longfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-async-ssa-graph-executor

test=develop

Parents: 43c82376, 660e4106

Showing 166 changed files with 3,883 additions and 1,148 deletions (+3883 / -1148)
Changed files (166):

AUTHORS.md  +2 -0
README.md  +11 -11
README_cn.md  +11 -11
paddle/fluid/API.spec  +4 -3
paddle/fluid/framework/block_desc.cc  +14 -0
paddle/fluid/framework/block_desc.h  +2 -0
paddle/fluid/framework/data_layout_transform.cc  +7 -16
paddle/fluid/framework/data_transform.cc  +24 -6
paddle/fluid/framework/details/all_reduce_deps_pass.cc  +2 -2
paddle/fluid/framework/details/build_strategy.cc  +5 -31
paddle/fluid/framework/details/build_strategy.h  +1 -1
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc  +4 -5
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h  +2 -2
paddle/fluid/framework/details/memory_optimize_helper.cc  +39 -8
paddle/fluid/framework/details/memory_optimize_helper.h  +6 -4
paddle/fluid/framework/details/memory_optimize_helper_test.cc  +4 -22
paddle/fluid/framework/details/memory_optimize_pass.cc  +6 -8
paddle/fluid/framework/details/multi_devices_graph_pass.cc  +15 -3
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc  +4 -11
paddle/fluid/framework/details/parallel_ssa_graph_executor.h  +2 -4
paddle/fluid/framework/details/sequential_execution_pass.cc  +2 -2
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc  +4 -5
paddle/fluid/framework/details/threaded_ssa_graph_executor.h  +2 -2
paddle/fluid/framework/ir/CMakeLists.txt  +1 -0
paddle/fluid/framework/ir/attention_lstm_fuse_pass.h  +2 -1
paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h  +4 -2
paddle/fluid/framework/ir/conv_bn_fuse_pass.h  +4 -2
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h  +2 -1
paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h  +2 -1
paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h  +2 -1
paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h  +4 -1
paddle/fluid/framework/ir/fc_fuse_pass.h  +4 -1
paddle/fluid/framework/ir/fc_gru_fuse_pass.h  +4 -2
paddle/fluid/framework/ir/fc_lstm_fuse_pass.h  +6 -2
paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h  +2 -1
paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h  +2 -1
paddle/fluid/framework/ir/graph.cc  +3 -0
paddle/fluid/framework/ir/graph.h  +7 -1
paddle/fluid/framework/ir/identity_scale_op_clean_pass.h  +2 -1
paddle/fluid/framework/ir/lock_free_optimize_pass.h  +2 -1
paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h  +2 -1
paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc  +151 -0
paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h  +2 -1
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h  +4 -1
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h  +2 -1
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h  +2 -1
paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h  +2 -1
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h  +2 -1
paddle/fluid/framework/op_proto_maker.h  +1 -1
paddle/fluid/framework/operator.cc  +16 -2
paddle/fluid/framework/operator.h  +35 -2
paddle/fluid/framework/operator_kernel_configs.h  +118 -0
paddle/fluid/framework/parallel_executor.cc  +47 -42
paddle/fluid/framework/parallel_executor.h  +3 -3
paddle/fluid/framework/tensor.h  +33 -8
paddle/fluid/framework/var_type_traits.h  +0 -5
paddle/fluid/imperative/layer.cc  +2 -1
paddle/fluid/imperative/layer.h  +29 -18
paddle/fluid/imperative/tracer.cc  +17 -8
paddle/fluid/imperative/tracer.h  +6 -4
paddle/fluid/inference/api/analysis_config.cc  +3 -3
paddle/fluid/inference/api/analysis_predictor.cc  +2 -2
paddle/fluid/inference/api/analysis_predictor.h  +1 -1
paddle/fluid/inference/api/analysis_predictor_tester.cc  +2 -2
paddle/fluid/inference/api/api_impl.cc  +1 -1
paddle/fluid/inference/api/paddle_analysis_config.h  +2 -2
paddle/fluid/inference/api/paddle_api.h  +1 -1
paddle/fluid/inference/tests/api/CMakeLists.txt  +4 -1
paddle/fluid/operators/CMakeLists.txt  +2 -1
paddle/fluid/operators/beam_search_decode_op.cc  +1 -1
paddle/fluid/operators/beam_search_decode_op.h  +1 -1
paddle/fluid/operators/benchmark/CMakeLists.txt  +3 -0
paddle/fluid/operators/benchmark/op_tester.cc  +303 -0
paddle/fluid/operators/benchmark/op_tester.h  +69 -0
paddle/fluid/operators/benchmark/op_tester_config.cc  +114 -0
paddle/fluid/operators/benchmark/op_tester_config.h  +51 -0
paddle/fluid/operators/conv_cudnn_op.cu.cc  +14 -45
paddle/fluid/operators/conv_cudnn_op_cache.h  +1 -95
paddle/fluid/operators/conv_fusion_op.cu.cc  +9 -24
paddle/fluid/operators/conv_op.cc  +34 -5
paddle/fluid/operators/detection/yolov3_loss_op.cc  +23 -17
paddle/fluid/operators/jit/test.cc  +13 -13
paddle/fluid/operators/lstm_op.h  +5 -3
paddle/fluid/operators/lstmp_op.cc  +10 -11
paddle/fluid/operators/lstmp_op.h  +67 -35
paddle/fluid/operators/math/CMakeLists.txt  +1 -0
paddle/fluid/operators/math/detail/lstm_cpu_kernel.h  +20 -18
paddle/fluid/operators/math/detail/lstm_gpu_kernel.h  +16 -14
paddle/fluid/operators/math/detail/lstm_kernel.h  +49 -13
paddle/fluid/operators/math/lstm_compute.cc  +5 -4
paddle/fluid/operators/math/lstm_compute.cu  +6 -6
paddle/fluid/operators/math/lstm_compute.h  +2 -2
paddle/fluid/operators/math/sample_prob.cc  +26 -0
paddle/fluid/operators/math/sample_prob.cu  +161 -0
paddle/fluid/operators/math/sample_prob.h  +118 -0
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc  +1 -11
paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc  +23 -22
paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc  +6 -2
paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc  +3 -4
paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc  +3 -4
paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc  +1 -1
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc  +22 -3
paddle/fluid/operators/ngraph/CMakeLists.txt  +1 -0
paddle/fluid/operators/ngraph/ngraph_bridge.cc  +5 -34
paddle/fluid/operators/ngraph/ngraph_bridge.h  +2 -7
paddle/fluid/operators/ngraph/ngraph_engine.cc  +2 -4
paddle/fluid/operators/ngraph/ops/CMakeLists.txt  +8 -0
paddle/fluid/operators/ngraph/ops/accuracy_op.h  +3 -0
paddle/fluid/operators/ngraph/ops/activation_op.h  +4 -0
paddle/fluid/operators/ngraph/ops/batch_norm_op.h  +4 -0
paddle/fluid/operators/ngraph/ops/binary_unary_op.h  +5 -0
paddle/fluid/operators/ngraph/ops/conv2d_op.h  +4 -0
paddle/fluid/operators/ngraph/ops/cross_entropy_op.h  +4 -0
paddle/fluid/operators/ngraph/ops/elementwise_add_op.h  +4 -0
paddle/fluid/operators/ngraph/ops/fill_constant_op.h  +3 -0
paddle/fluid/operators/ngraph/ops/mean_op.h  +4 -0
paddle/fluid/operators/ngraph/ops/momentum_op.h  +3 -0
paddle/fluid/operators/ngraph/ops/mul_op.h  +4 -0
paddle/fluid/operators/ngraph/ops/op_bridge.h  +84 -0
paddle/fluid/operators/ngraph/ops/pool2d_op.h  +4 -0
paddle/fluid/operators/ngraph/ops/scale_op.h  +3 -0
paddle/fluid/operators/ngraph/ops/softmax_op.h  +4 -0
paddle/fluid/operators/ngraph/ops/top_k_op.h  +3 -0
paddle/fluid/operators/pool_op.cc  +84 -54
paddle/fluid/operators/sample_logits_op.cc  +225 -0
paddle/fluid/operators/sample_logits_op.cu  +257 -0
paddle/fluid/operators/sample_logits_op.h  +245 -0
paddle/fluid/platform/CMakeLists.txt  +2 -2
paddle/fluid/platform/device_context.cc  +2 -2
paddle/fluid/platform/device_tracer.cc  +14 -2
paddle/fluid/platform/device_tracer.h  +1 -2
paddle/fluid/platform/enforce.h  +1 -0
paddle/fluid/platform/event.h  +65 -0
paddle/fluid/platform/mkldnn_reuse.h  +45 -38
paddle/fluid/platform/mkldnn_utils.h  +69 -0
paddle/fluid/platform/profiler.cc  +49 -13
paddle/fluid/platform/profiler.cu  +10 -10
paddle/fluid/platform/profiler.h  +15 -47
paddle/fluid/platform/profiler_test.cc  +0 -1
paddle/fluid/platform/temporary_allocator_test.cc  +4 -4
paddle/fluid/pybind/imperative.cc  +4 -4
paddle/fluid/pybind/ir.cc  +2 -1
paddle/fluid/pybind/protobuf.cc  +2 -0
paddle/fluid/pybind/pybind.cc  +4 -3
paddle/fluid/train/demo/demo_trainer.cc  +2 -2
paddle/fluid/train/test_train_recognize_digits.cc  +1 -1
paddle/scripts/paddle_build.sh  +1 -0
python/paddle/fluid/compiler.py  +49 -24
python/paddle/fluid/executor.py  +2 -0
python/paddle/fluid/framework.py  +51 -15
python/paddle/fluid/imperative/layers.py  +23 -4
python/paddle/fluid/imperative/nn.py  +18 -19
python/paddle/fluid/layer_helper.py  +3 -0
python/paddle/fluid/layers/detection.py  +10 -9
python/paddle/fluid/layers/nn.py  +247 -25
python/paddle/fluid/layers/ops.py  +22 -7
python/paddle/fluid/parallel_executor.py  +6 -12
python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py  +60 -1
python/paddle/fluid/tests/unittests/test_base_layer.py  +22 -15
python/paddle/fluid/tests/unittests/test_imperative.py  +25 -22
python/paddle/fluid/tests/unittests/test_imperative_gan.py  +15 -15
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py  +71 -64
python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py  +8 -2
python/paddle/fluid/tests/unittests/test_imperative_resnet.py  +40 -18
python/paddle/fluid/tests/unittests/test_layers.py  +11 -0
python/paddle/fluid/tests/unittests/test_lstmp_op.py  +39 -20
AUTHORS.md
@@ -44,6 +44,7 @@
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Sand3r- | Michal Gallus |
+| sfraczek | Sylwester Fraczek |
 | Superjom | Chun-Wei Yan |
 | tensor-tang | Jian Tang |
 | tianbingsz | Tian-Bing Xu |
@@ -54,6 +55,7 @@
 | wangyang59 | Yang Wang |
 | wangzhen-nlp | Zhen Wang |
 | wen-bo-yang | Wen-Bo Yang |
+| wojtuss | Wojciech Uss |
 | wwhu | Wei-Wei Hu |
 | xinghai-sun | Xing-Hai Sun |
 | Xreki | Yi-Qun Liu |
README.md
@@ -3,8 +3,8 @@
 English | [简体中文](./README_cn.md)
 [badge](https://travis-ci.org/PaddlePaddle/Paddle)
-[badge](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
+[badge](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
-[badge](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
+[badge](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
 [badge](https://github.com/PaddlePaddle/Paddle/releases)
 [badge](LICENSE)
@@ -18,7 +18,7 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
-### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
+### Latest PaddlePaddle Release: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -26,9 +26,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.2.0.post87
+pip install paddlepaddle-gpu==1.3.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.2.0.post85
+pip install paddlepaddle-gpu==1.3.0.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
 ## Installation
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) on our website.
 
 ## Documentation
-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.3/user_guides/howto/training/multi_node_en.html)
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/en/1.3/api/index_en.html)
   Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.3/advanced_usage/development/contribute_to_paddle/index_en.html)
   We appreciate your contributions!
README_cn.md
@@ -3,8 +3,8 @@
 [English](./README.md) | 简体中文
 [badge](https://travis-ci.org/PaddlePaddle/Paddle)
-[badge](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
+[badge](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
-[badge](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
+[badge](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
 [badge](https://github.com/PaddlePaddle/Paddle/releases)
 [badge](LICENSE)
@@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
 
-### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
+### PaddlePaddle最新版本: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
 ### 安装最新稳定版本:
 ```
 # Linux CPU
@@ -24,9 +24,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.2.0.post87
+pip install paddlepaddle-gpu==1.3.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.2.0.post85
+pip install paddlepaddle-gpu==1.3.0.post85
 # 其他平台上的安装指引请参考 http://paddlepaddle.org/
 ```
@@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
 ## 安装
-推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html)
+推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/install/index_cn.html)
 
 ## 文档
-我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和
+我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)和
-[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档
+[中文](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) 文档
 
 - [深度学习101](https://github.com/PaddlePaddle/book)
   或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
 
-- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
+- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.3/user_guides/howto/training/multi_node.html)
   可以在MPI集群上运行分布式训练任务
 
-- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.3/api_cn/index_cn.html)
   新的API支持代码更少更简洁的程序
 
-- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
+- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/development/contribute_to_paddle/index_cn.html)
   欢迎您的贡献!
paddle/fluid/API.spec
@@ -43,7 +43,7 @@ paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'start
 paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False))
 paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
@@ -71,7 +71,7 @@ paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'v
 paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
-paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
+paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None))
 paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
 paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
 paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
@@ -121,6 +121,7 @@ paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=
 paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
 paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
+paddle.fluid.layers.sampled_softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0))
 paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
 paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
@@ -303,7 +304,7 @@ paddle.fluid.layers.reciprocal ArgSpec(args=['x', 'name'], varargs=None, keyword
 paddle.fluid.layers.square ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.softplus ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.softsign ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0))
 paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
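Note on the `dynamic_lstmp` change above: the ArgSpec gains `h_0`, `c_0`, `cell_clip`, and `proj_clip`, all defaulting to None. A minimal sketch of a call using the new arguments, assuming the Fluid 1.3 API; the dimensions and clip values are illustrative, not taken from this commit:

```python
import paddle.fluid as fluid

hidden_dim, proj_dim = 512, 256  # illustrative sizes

# LoD-level-1 sequence input; dynamic_lstmp expects a 4*hidden_dim
# linear projection of the input.
seq = fluid.layers.data(name='seq', shape=[128], dtype='float32', lod_level=1)
fc = fluid.layers.fc(input=seq, size=hidden_dim * 4)

# cell_clip/proj_clip (clipping of cell state and projection output)
# are among the arguments added in this commit's ArgSpec; h_0/c_0
# (initial states) keep their None defaults here.
proj_out, cell_out = fluid.layers.dynamic_lstmp(
    input=fc,
    size=hidden_dim * 4,
    proj_size=proj_dim,
    cell_clip=3.0,
    proj_clip=3.0)
```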
paddle/fluid/framework/block_desc.cc
@@ -163,6 +163,20 @@ std::vector<OpDesc *> BlockDesc::AllOps() const {
   return res;
 }
 
+void BlockDesc::Clear() {
+  // clear all ops
+  ops_.clear();
+
+  // clear all vars which are not persistable
+  for (auto it = vars_.begin(); it != vars_.end();) {
+    if (it->second->Persistable()) {
+      ++it;
+    } else {
+      vars_.erase(it++);
+    }
+  }
+}
+
 void BlockDesc::Flush() {
   for (auto &op_desc : ops_) {
     op_desc->Flush();
paddle/fluid/framework/block_desc.h
@@ -97,6 +97,8 @@ class BlockDesc {
   std::vector<OpDesc *> AllOps() const;
 
+  void Clear();
+
   size_t OpSize() const { return ops_.size(); }
 
   OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
paddle/fluid/framework/data_layout_transform.cc
@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
   out_layout =
       out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
 
-  auto& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
-      pool.Get(expected_kernel_type.place_));
-  auto& cpu_engine = dev_ctx->GetEngine();
-
   std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
   std::vector<int> out_tz = in_tz;
@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                  "Input tensor type is not supported: %s", in.type());
   memory::data_type out_type = in_type;
 
-  auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
-  auto out_format =
-      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
-
   // output tensor has the same dims as input. Reorder don't change dims
   out->Resize(in.dims());
 
-  if (in_format != out_format) {
+  // tempory mem pd fr out , to make reorder
+  auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
+      paddle::framework::vectorize2int(out->dims()),
+      mkldnn::memory::format::blocked, out_type);
+  if (in.get_mkldnn_prim_desc() != out_mem_pd) {
     void* in_data = GetDataFromTensor(in, in_type);
    auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
 
-    auto in_memory =
-        memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
-    auto out_memory =
-        memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+    auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
+    auto out_memory = memory(out_mem_pd, out_data);
 
     platform::Reorder(in_memory, out_memory);
   } else {
     out->ShareDataWith(in);
   }
 
   out->set_layout(out_layout);
-  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
-  out->set_format(memory::format::format_undef);
 #endif
 }
paddle/fluid/framework/data_transform.cc
@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type,
 #ifdef PADDLE_WITH_MKLDNN
       // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
      // Just set layout/format. No real transform occur
-      out.ShareDataWith(input_tensor);
-      auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
-                                                      ToMKLDNNFormat(lin));
-      out.set_layout(DataLayout::kMKLDNN);
-      out.set_format(out_format);
+      // TODO(jczaja): Remove that once all mkldnn ops
+      // are modified to work with mkldnn_blocked
+      auto mkldnn_fmt = [&](int rank) {
+        switch (rank) {
+          case 5:
+            return mkldnn::memory::format::ncdhw;
+          case 4:
+            return mkldnn::memory::format::nchw;
+          case 3:
+            return mkldnn::memory::format::ncw;
+          case 2:
+            return mkldnn::memory::format::nc;
+          case 1:
+            return mkldnn::memory::format::x;
+          default:
+            return mkldnn::memory::format::blocked;
+        }
+      };
+
+      auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
+          paddle::framework::vectorize2int(out.dims()),
+          mkldnn_fmt(out.dims().size()));
+
+      out.ShareDataWith(input_tensor);
+      out.set_mkldnn_prim_desc(out_mem_pd);
 #endif
     } else {
       // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -50,7 +50,7 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
   std::unordered_map<std::string, int> vars;
   // TODO(gongwb): use graph topology sort to find the order of operators.
   //               Note that must assert topology sort is stable
-  auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
+  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
   for (auto* op_desc : ops) {
     auto outputs = op_desc->Outputs();
     for (auto& o_it : outputs) {
@@ -120,4 +120,4 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
 REGISTER_PASS(all_reduce_deps_pass,
               paddle::framework::details::AllReduceDepsPass)
-    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
+    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
paddle/fluid/framework/details/build_strategy.cc
@@ -136,14 +136,17 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     ir::Pass *multi_devices_pass;
     if (strategy_.is_distribution_) {
+      VLOG(3) << "multi device parameter server mode";
       multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
     } else if (strategy_.async_mode_) {
       multi_devices_pass = AppendPass("async_multi_devices_pass").get();
     } else {
       if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+        VLOG(3) << "multi devices collective mode with allreduce";
         multi_devices_pass =
             AppendPass("allreduce_mode_multi_devices_pass").get();
       } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+        VLOG(3) << "multi deivces collective mode with reduce";
         multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
       } else {
         PADDLE_THROW("Unknown reduce strategy.");
@@ -174,7 +177,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
 }
 
 std::unique_ptr<ir::Graph> BuildStrategy::Apply(
-    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> graph,
+    const std::vector<platform::Place> &places,
     const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
     const size_t &nranks,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
@@ -185,7 +189,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
   // Create a default one if not finalized by user.
   CreatePassesFromStrategy(false);
 
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
   for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
     if (IsMultiDevPass(pass->Type())) {
       pass->Erase(kPlaces);
@@ -203,41 +206,12 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->Erase("nccl_ctxs");
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
-    } else if (pass->Type() == "memory_optimize_pass") {
-      if (graph->Has(kAllOpDescs)) {
-        graph->Erase(kAllOpDescs);
-      }
-      const std::vector<OpDesc *> *all_op_descs =
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps());
-      graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
-                                              all_op_descs);  // take ownership
-
-      pass->Erase(kAllOpDescs);
-      pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
-
     } else if (pass->Type() == "sequential_execution_pass") {
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
-
-      pass->Erase(kAllOpDescs);
-      pass->Set<const std::vector<OpDesc *>>(
-          kAllOpDescs,
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     } else if (pass->Type() == "all_reduce_deps_pass") {
       LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
                 << ", num_trainers:" << num_trainers_;
-
-      pass->Erase(kAllOpDescs);
-      pass->Set<const std::vector<OpDesc *>>(
-          kAllOpDescs,
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
-    } else if (pass->Type() == "inplace_pass") {
-      if (graph->Has(kAllOpDescs)) {
-        graph->Erase(kAllOpDescs);
-      }
-      graph->Set<const std::vector<OpDesc *>>(
-          kAllOpDescs,
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
       if (!use_cuda) {
         LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
paddle/fluid/framework/details/build_strategy.h
@@ -115,7 +115,7 @@ struct BuildStrategy {
   // Apply the passes built by the pass_builder_. The passes will be
   // applied to the Program and output an ir::Graph.
-  std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
+  std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph,
                                    const std::vector<platform::Place> &places,
                                    const std::string &loss_var_name,
                                    const std::vector<Scope *> &local_scopes,
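The `Apply` signature change above (consuming an `ir::Graph` instead of a `ProgramDesc`) is the C++ side of the `program_or_graph` rename in `CompiledProgram.__init__` shown in the API.spec diff. A minimal Python sketch of the user-facing call, assuming the Fluid 1.3 interface; the network itself is illustrative:

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))

# The first positional argument is now named `program_or_graph`, so a
# Program (as here) or an already-compiled graph can be passed.
compiled = fluid.CompiledProgram(fluid.default_main_program())
compiled = compiled.with_data_parallel(loss_name=loss.name)
```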
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -24,12 +24,11 @@ namespace details {
 FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    std::unique_ptr<ir::Graph> &&graph)
+    const std::vector<platform::Place> &places, ir::Graph *graph)
     : strategy_(strategy),
       local_scopes_(local_scopes),
       places_(places),
-      graph_(std::move(graph)),
+      graph_(graph),
       pool_(strategy.num_threads_),
       prepare_pool_(1),  // add one more thread for generate op_deps
       fetch_ctxs_(places) {
@@ -110,14 +109,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
       }
     }
     if (exception_.IsCaught()) {
-      ClearFetchOp(graph_.get(), &fetch_ops);
+      ClearFetchOp(graph_, &fetch_ops);
       exception_.ReThrow();
     }
   }
   num_complete += num_comp;
   }
   // Wait FetchOps.
-  ClearFetchOp(graph_.get(), &fetch_ops);
+  ClearFetchOp(graph_, &fetch_ops);
   return fetches;
 }
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                                const std::vector<Scope *> &local_scopes,
                                const std::vector<platform::Place> &places,
-                               std::unique_ptr<ir::Graph> &&graph);
+                               ir::Graph *graph);
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
   const ir::Graph &Graph() const override;
@@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-  std::unique_ptr<ir::Graph> graph_;
+  ir::Graph *graph_;
 
   std::unordered_map<OpHandleBase *, int> op_deps_;
   std::vector<OpHandleBase *> bootstrap_ops_;
paddle/fluid/framework/details/memory_optimize_helper.cc
View file @ cf0511f2
...
@@ -33,10 +33,10 @@ namespace details {
 using paddle::framework::VarDesc;

 std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
-  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
-                 "Graph has no attribute of kAllOpDescs.");
+  PADDLE_ENFORCE(graph.Has(kStaleProgramOpDescs),
+                 "Graph has no attribute of kStaleProgramOpDescs.");
   // 1. get op desc order
-  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
+  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);

   // 2. topology sort order
   auto nodes = graph.Nodes();
...
@@ -461,11 +461,21 @@ void ControlFlowGraph::LiveVariableAnalysis() {
       }
     }
   }
+
+  for (auto* op : ops_) {
+    unlived_vars_[op] = std::set<std::string>();
+    for (auto& var : this->LiveIn(op)) {
+      if (!this->LiveOut(op).count(var)) {
+        unlived_vars_[op].insert(var);
+      }
+    }
+  }
 }

 void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
                                            const std::string& new_node,
                                            int begin_idx) {
+  std::vector<bool> need_update(ops_.size(), false);
   // update graph from begin idx to the end
   for (size_t i = begin_idx; i != ops_.size(); ++i) {
     auto* op = ops_[i];
...
@@ -480,15 +490,27 @@ void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
     if (live_in_[op].find(old_node) != live_in_[op].end()) {
       live_in_[op].erase(old_node);
       live_in_[op].insert(new_node);
+      need_update[i] = true;
     }
     if (live_out_[op].find(old_node) != live_out_[op].end()) {
       live_out_[op].erase(old_node);
       live_out_[op].insert(new_node);
+      need_update[i] = true;
     }
   }
+
+  for (size_t i = begin_idx; i < ops_.size(); ++i) {
+    if (!need_update[i]) continue;
+    auto* op = ops_[i];
+    for (auto& var : this->LiveIn(op)) {
+      if (!this->LiveOut(op).count(var)) {
+        unlived_vars_[op].insert(var);
+      }
+    }
+  }
 }

-const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+const std::set<std::string>& ControlFlowGraph::LiveIn(ir::Node* op) const {
   auto it = live_in_.find(op);
   PADDLE_ENFORCE(it != live_in_.end(),
...
@@ -496,7 +518,7 @@ const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
   return it->second;
 }

-const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+const std::set<std::string>& ControlFlowGraph::LiveOut(ir::Node* op) const {
   auto it = live_out_.find(op);
   PADDLE_ENFORCE(it != live_out_.end(),
...
@@ -504,15 +526,24 @@ const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
   return it->second;
 }

-const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+const std::set<std::string>& ControlFlowGraph::Use(ir::Node* op) const {
   auto it = uses_.find(op);
-  PADDLE_ENFORCE(it != uses_.end(),
-                 string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  PADDLE_ENFORCE(it != uses_.end(),
+                 string::Sprintf("Expect %s in use, but Not Found.", op->Name()));
   return it->second;
 }

+const std::set<std::string>& ControlFlowGraph::Unlived(ir::Node* op) const {
+  auto it = unlived_vars_.find(op);
+  PADDLE_ENFORCE(it != unlived_vars_.end(),
+                 string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name()));
+  return it->second;
+}
+
-const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
+const std::vector<ir::Node*>& ControlFlowGraph::Ops() const { return ops_; }

 std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
...
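The change above precomputes, for every op, the set of variables that are live on entry but not on exit, and caches it in unlived_vars_ so the memory-optimize pass no longer re-derives that set on every query. Below is a minimal standalone sketch of the same computation, using plain strings for variables and simplified stand-in types rather than Paddle's ir::Node machinery:

#include <set>
#include <string>
#include <vector>

struct OpLiveness {
  std::set<std::string> live_in;
  std::set<std::string> live_out;
};

// A variable is "unlived" after an op if it is live on entry but not on
// exit: exactly the set of buffers that can be recycled once the op runs.
std::vector<std::set<std::string>> ComputeUnlived(
    const std::vector<OpLiveness>& ops) {
  std::vector<std::set<std::string>> unlived(ops.size());
  for (size_t i = 0; i < ops.size(); ++i) {
    for (const auto& var : ops[i].live_in) {
      if (ops[i].live_out.count(var) == 0) {
        unlived[i].insert(var);
      }
    }
  }
  return unlived;
}

The second loop in RenameVarInCFGGraph follows the same pattern, but only revisits the ops whose live sets actually changed (tracked via need_update).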
paddle/fluid/framework/details/memory_optimize_helper.h
View file @ cf0511f2
...
@@ -92,10 +92,11 @@ class ControlFlowGraph {
   void RenameVarInCFGGraph(const std::string& old_node,
                            const std::string& new_node, int begin_idx);

-  const std::set<std::string> LiveIn(ir::Node* op) const;
-  const std::set<std::string> LiveOut(ir::Node* op) const;
-  const std::set<std::string> Use(ir::Node* op) const;
-  const std::vector<ir::Node*> Ops() const;
+  const std::set<std::string>& LiveIn(ir::Node* op) const;
+  const std::set<std::string>& LiveOut(ir::Node* op) const;
+  const std::set<std::string>& Use(ir::Node* op) const;
+  const std::set<std::string>& Unlived(ir::Node* op) const;
+  const std::vector<ir::Node*>& Ops() const;
   std::vector<ir::Node*>& Ops();

   // for ssa-graph nodes
...
@@ -117,6 +118,7 @@ class ControlFlowGraph {
   VarSetMap live_out_;
   VarSetMap uses_;  // op inputs
   VarSetMap defs_;  // op outputs
+  std::unordered_map<ir::Node*, std::set<std::string>> unlived_vars_;
   std::vector<ir::Node*> ops_;  // op sequence by topology sort
 };
...
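Besides adding Unlived(), this hunk changes every accessor from returning a std::set by value to returning a const reference, which removes a full copy of the set on each call. A hypothetical illustration (not Paddle code) of the difference:

#include <set>
#include <string>

class Liveness {
 public:
  // Copies the whole set every time it is called.
  std::set<std::string> ByValue() const { return vars_; }

  // Hands out a read-only view of the member; no allocation, no copy.
  const std::set<std::string>& ByRef() const { return vars_; }

 private:
  std::set<std::string> vars_{"a", "b", "c"};
};

The trade-off is the usual one: callers of ByRef() must not hold the reference past the lifetime of the Liveness object, which is safe here because the pass owns the ControlFlowGraph for its whole run.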
paddle/fluid/framework/details/memory_optimize_helper_test.cc
View file @ cf0511f2
...
@@ -228,9 +228,6 @@ TEST(CFGGraph, IRGraph) {
   // prepare ir graph
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership

   ControlFlowGraph cfg(graph);
   cfg.LiveVariableAnalysis();
...
@@ -256,9 +253,6 @@ TEST(CFGGraph, IRGraph) {
 TEST(SortOpLikeDescOrder, NormalTest) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership

   auto nodes = SortOpLikeDescOrder(graph);
   auto op_descs = prog.Block(0).AllOps();
...
@@ -273,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) {
 TEST(SortOpLikeDescOrder, RemoveOpDesc) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
   auto nodes = graph.Nodes();
   auto op_descs = prog.Block(0).AllOps();
   ir::Node* found_node = nullptr;
...
@@ -324,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) {
 // 3. add some op_desc
 TEST(SortOpLikeDescOrder, AddOpDesc) {
   auto prog = FillProgramDesc();
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
   ir::Graph graph(prog);

   auto find_node_in_graph = [&](std::string s) {
...
@@ -342,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
   // cached desc different with real one
   // mimic the intermidiete pass modify the programdesc.
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto op_descs = prog.Block(0).AllOps();
+  std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();

   auto op = prog.MutableBlock(0)->AppendOp();
   prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
...
@@ -376,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
 TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership

   auto find_node_in_graph = [&](std::string s) {
     ir::Node* ret = nullptr;
...
@@ -392,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
     return ret;
   };

+  std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
   // remove sum node
-  auto op_descs = prog.Block(0).AllOps();
   ir::Node* found_node = nullptr;
   auto nodes = graph.Nodes();
   for (auto node : nodes) {
...
@@ -454,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
 TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();

   auto find_node_in_graph = [&](std::string s) {
     ir::Node* ret = nullptr;
...
@@ -470,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
     return ret;
   };
-  auto op_descs = prog.Block(0).AllOps();

   // add node
   auto op = prog.MutableBlock(0)->AppendOp();
   prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
...
paddle/fluid/framework/details/memory_optimize_pass.cc
View file @ cf0511f2
...
@@ -118,8 +118,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
       }
     }
     // fill the pool
-    for (auto var : cfg_->LiveIn(op)) {
-      if (cfg_->LiveOut(op).count(var) == 0) {
+    for (auto& var : cfg_->Unlived(op)) {
       ir::Node* var_node = cfg_->GetNodeByName(var, op);
       if (var_node == nullptr || var_node->IsCtrlVar()) continue;
       if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
...
@@ -127,7 +126,6 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
       }
     }
-    }
   }
   graph->ResolveHazard(var_nodes_);
   return graph;
...
@@ -337,4 +335,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
 REGISTER_PASS(memory_optimize_pass,
               paddle::framework::details::MemoryOptimizePass)
-    .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
+    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
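The pool-filling loop now iterates the precomputed Unlived(op) set directly instead of scanning every live-in variable and re-testing LiveOut membership. A simplified sketch of the new shape of that step, with stand-in types rather than Paddle's node classes:

#include <set>
#include <string>
#include <vector>

void FillPool(const std::set<std::string>& unlived_after_op,
              std::vector<std::string>* pool) {
  for (const auto& var : unlived_after_op) {
    // The real pass additionally checks that the node is reusable
    // (NodeCanReused) and not a control-dependency variable, and that
    // it is not already in the pool, before inserting it.
    pool->push_back(var);
  }
}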
paddle/fluid/framework/details/multi_devices_graph_pass.cc
View file @ cf0511f2
...
@@ -969,9 +969,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
 }

 void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
-  if (need_broadcast_var_ ||
-      (UseGPU() &&
-       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) {
+  // broad cast received parameters when training in parameter server mode.
+  if (need_broadcast_var_) {
+    // There are 4 conditions:
+    // 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS.
+    // Need to broadcast received parameters to other GPU.
+    // 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to
+    // broadcast received parameters to other GPU.
+    // 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to
+    // broadcast received parameters to other scope.
+    // 4. CPU && Reduce: because all parameters share the same memory, did not
+    // broadcast received parameters.
+    if (!UseGPU() &&
+        strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+      return;
+    }
     if (strategy_.fuse_broadcast_op_) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);
     } else {
...
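The four conditions in the new comment collapse to a single rule: only the CPU-plus-Reduce combination skips the broadcast, because in that mode all local scopes share the same parameter memory. A compact restatement as a hypothetical helper (illustrative only, not Paddle's API):

// Returns whether received parameters must be broadcast to the other
// devices/scopes after a parameter-server update.
bool NeedBroadcastReceivedParams(bool use_gpu, bool is_reduce_strategy) {
  // GPU+Reduce, GPU+AllReduce, CPU+AllReduce -> broadcast is required.
  // CPU+Reduce -> parameters share memory, nothing to broadcast.
  return !(!use_gpu && is_reduce_strategy);
}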
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
View file @ cf0511f2
...
@@ -20,8 +20,7 @@ namespace framework {
 namespace details {

 std::vector<std::unique_ptr<ir::Graph>>
-ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
-    std::unique_ptr<ir::Graph> &&graph) {
+ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
   std::vector<std::unique_ptr<ir::Graph>> graphs;
   graphs.reserve(places_.size());
   for (size_t i = 0; i < places_.size(); ++i) {
...
@@ -77,24 +76,18 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
 ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    const framework::ProgramDesc &main_prog,
-    std::unique_ptr<ir::Graph> &&graph)
+    const std::vector<platform::Place> &places, ir::Graph *graph)
     : strategy_(std::move(strategy)),
       local_scopes_(std::move(local_scopes)),
       pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
       places_(std::move(places)),
-      main_prog_(main_prog),
       // TODO(Yancey1989): Copying graphs is not safely since it deleted the
       // attrs.
-      graphs_(SeparateMultiDevicesGraph(std::move(graph))) {
+      graphs_(SeparateMultiDevicesGraph(graph)) {
   PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());

   auto seq_allreduce_pass =
       ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
-  seq_allreduce_pass->Erase(details::kAllOpDescs);
-  seq_allreduce_pass->Set<const std::vector<OpDesc *>>(
-      details::kAllOpDescs,
-      new std::vector<OpDesc *>(main_prog_.Block(0).AllOps()));
   for (size_t i = 0; i < graphs_.size(); ++i) {
     graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
   }
...
@@ -107,7 +100,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
       << " to run the operators of the graph on each device.";
   for (size_t i = 0; i < places.size(); ++i) {
     executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
-        strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i))));
+        strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get()));
   }
 }
...
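The new signatures spell out a clear ownership story: the parallel executor owns the per-device graphs it splits off (a vector of unique_ptr), while each per-place worker executor merely borrows a raw pointer. A sketch of that pattern with illustrative names (not Paddle's types):

#include <memory>
#include <vector>

struct Graph {};

struct Worker {
  explicit Worker(Graph* g) : graph(g) {}  // non-owning borrow
  Graph* graph;
};

struct Owner {
  std::vector<std::unique_ptr<Graph>> graphs;  // sole owner
  std::vector<Worker> workers;

  void Build(size_t n) {
    for (size_t i = 0; i < n; ++i) {
      graphs.emplace_back(new Graph());
      // Borrow, mirroring graphs_.at(i).get() in the diff above.
      workers.emplace_back(graphs.back().get());
    }
  }
};

Because the workers never outlive the owner, the raw pointers stay valid without any reference counting.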
paddle/fluid/framework/details/parallel_ssa_graph_executor.h
View file @ cf0511f2
...
@@ -31,8 +31,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
   ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           const framework::ProgramDesc &main_prog,
-                           std::unique_ptr<ir::Graph> &&graph);
+                           ir::Graph *graph);
   ~ParallelSSAGraphExecutor() final = default;
   const ir::Graph &Graph() const override { return *graphs_[0]; }
...
@@ -41,13 +40,12 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
  private:
   std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
-      std::unique_ptr<ir::Graph> &&graph);
+      ir::Graph *graph);

   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::unique_ptr<::ThreadPool> pool_{nullptr};
   std::vector<platform::Place> places_;
-  framework::ProgramDesc main_prog_;
   std::vector<std::unique_ptr<ir::Graph>> graphs_;

   std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
...
paddle/fluid/framework/details/sequential_execution_pass.cc
View file @ cf0511f2
...
@@ -40,7 +40,7 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
   static std::unordered_set<std::string> skip_dist_ops{
       "send", "recv", "send_barrier", "fetch_barrier"};

-  auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
+  auto &ops = graph->Get<const std::vector<OpDesc *>>(kStaleProgramOpDescs);
   std::vector<ir::Node *> op_node_list;
   op_node_list.reserve(ops.size());
...
@@ -107,4 +107,4 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
 REGISTER_PASS(sequential_execution_pass,
               paddle::framework::details::SequentialExecutionPass)
-    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
+    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
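Note the mechanism change here: the op-desc list is no longer a pass attribute injected from outside (RequirePassAttr / Get), but a graph attribute the graph itself carries (RequireGraphAttr / graph->Get), set once when the graph is built. A generic stand-in for that attribute mechanism, simplified and not Paddle's ir::Graph:

#include <cassert>
#include <map>
#include <memory>
#include <string>

class AttrMap {
 public:
  template <typename T>
  void Set(const std::string& name, T* value) {  // takes ownership of value
    attrs_[name] = std::shared_ptr<void>(value);
  }

  bool Has(const std::string& name) const { return attrs_.count(name) > 0; }

  template <typename T>
  T& Get(const std::string& name) const {
    assert(Has(name));  // the real code uses PADDLE_ENFORCE here
    return *static_cast<T*>(attrs_.at(name).get());
  }

 private:
  std::map<std::string, std::shared_ptr<void>> attrs_;
};

Keeping the list on the graph means every pass in a pipeline reads the same snapshot without each pass having to be configured separately.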
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
View file @ cf0511f2
...
@@ -23,9 +23,8 @@ namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    std::unique_ptr<ir::Graph> &&graph)
-    : graph_(std::move(graph)),
+    const std::vector<platform::Place> &places, ir::Graph *graph)
+    : graph_(graph),
       pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                        : nullptr),
       local_scopes_(local_scopes),
...
@@ -123,7 +122,7 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl(
       for (auto &run_op_future : run_op_futures_) {
         run_op_future.wait();
       }
-      ClearFetchOp(graph_.get(), &fetch_ops);
+      ClearFetchOp(graph_, &fetch_ops);
       exception_holder_.ReThrow();
     } else {
       continue;
...
@@ -148,7 +147,7 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl(
   }
   PADDLE_ENFORCE(ready_ops.empty());

   // Wait FetchOps.
-  ClearFetchOp(graph_.get(), &fetch_ops);
+  ClearFetchOp(graph_, &fetch_ops);

   return fetch_data;
 }
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
View file @ cf0511f2
...
@@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::unique_ptr<ir::Graph> &&graph);
+                           ir::Graph *graph);

   const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
...
@@ -56,7 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
                     details::OpHandleBase *op);

  private:
-  std::unique_ptr<ir::Graph> graph_;
+  ir::Graph *graph_;
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
...
paddle/fluid/framework/ir/CMakeLists.txt
View file @ cf0511f2
...
@@ -102,6 +102,7 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
    cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
+    cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
endif()
paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
View file @ cf0511f2
...
@@ -22,7 +22,8 @@ namespace ir {
 class AttentionLSTMFusePass : public FusePassBase {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };

 }  // namespace ir
...
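This and the following pass headers all receive the same mechanical change: ApplyImpl is marked override. Why that matters, in a minimal illustration (not Paddle code):

struct Pass {
  virtual ~Pass() = default;
  virtual int Apply(int g) const { return g; }
};

struct MyPass : Pass {
  // Without `override`, a later change to the base signature would quietly
  // turn this into an unrelated function that is never dispatched to.
  // With it, any mismatch becomes a compile-time error.
  int Apply(int g) const override { return g + 1; }
};

The same one-line rationale applies to every ApplyImpl hunk below.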
paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
View file @ cf0511f2
...
@@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase {
   virtual ~ConvAffineChannelFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_affine_channel_fuse"};
 };
...
@@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
   virtual ~ConvEltwiseAddAffineChannelFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
 };
...
paddle/fluid/framework/ir/conv_bn_fuse_pass.h
View file @ cf0511f2
...
@@ -31,7 +31,8 @@ class ConvBNFusePass : public FusePassBase {
   virtual ~ConvBNFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_bn_fuse"};
 };
...
@@ -40,7 +41,8 @@ class ConvEltwiseAddBNFusePass : public FusePassBase {
   virtual ~ConvEltwiseAddBNFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
 };
...
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
View file @ cf0511f2
...
@@ -25,7 +25,8 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase {
   virtual ~ConvElementwiseAdd2ActFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };

 }  // namespace ir
...
paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
View file @ cf0511f2
...
@@ -25,7 +25,8 @@ class ConvElementwiseAddActFusePass : public FusePassBase {
   virtual ~ConvElementwiseAddActFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };

 }  // namespace ir
...
paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
View file @ cf0511f2
...
@@ -25,7 +25,8 @@ class ConvElementwiseAddFusePass : public FusePassBase {
   virtual ~ConvElementwiseAddFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };

 }  // namespace ir
...
paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
View file @ cf0511f2
...
@@ -14,6 +14,8 @@
 #pragma once

+#include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...
@@ -30,7 +32,8 @@ class EmbeddingFCLSTMFusePass : public FusePassBase {
   virtual ~EmbeddingFCLSTMFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

   const std::string name_scope_{"embedding_fc_lstm_fuse"};
 };
...
paddle/fluid/framework/ir/fc_fuse_pass.h
View file @ cf0511f2
...
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#pragma once
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...
@@ -29,7 +31,8 @@ class FCFusePass : public FusePassBase {
   virtual ~FCFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };

 }  // namespace ir
...
paddle/fluid/framework/ir/fc_gru_fuse_pass.h
View file @ cf0511f2
...
@@ -30,7 +30,8 @@ class FCGRUFusePass : public FusePassBase {
   virtual ~FCGRUFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

   const std::string name_scope_{"fc_gru_fuse"};
 };
...
@@ -41,7 +42,8 @@ class MulGRUFusePass : public FusePassBase {
   virtual ~MulGRUFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

   const std::string name_scope_{"fc_nobias_gru_fuse"};
 };
...
paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
View file @ cf0511f2
...
@@ -14,6 +14,8 @@
 #pragma once

+#include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...
@@ -30,7 +32,8 @@ class FCLstmFusePass : public FusePassBase {
   virtual ~FCLstmFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

   const std::string name_scope_{"fc_lstm_fuse"};
 };
...
@@ -40,7 +43,8 @@ class MulLstmFusePass : public FusePassBase {
   virtual ~MulLstmFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

   const std::string name_scope_{"fc_nobias_lstm_fuse"};
 };
...
paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
View file @ cf0511f2
...
@@ -32,7 +32,8 @@ class FuseElewiseAddActPass : public FusePassBase {
   virtual ~FuseElewiseAddActPass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

   std::unique_ptr<ir::Graph> FuseElewiseAddAct(
       std::unique_ptr<ir::Graph> graph,
...
paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
View file @ cf0511f2
...
@@ -32,7 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase {
   virtual ~FuseReluDepthwiseConvPass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

   std::unique_ptr<ir::Graph> FuseReluDepthwiseConv(
       std::unique_ptr<ir::Graph> graph, bool only_forward) const;
 };
...
paddle/fluid/framework/ir/graph.cc
View file @ cf0511f2
...
@@ -76,6 +76,9 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
       var->inputs.push_back(node);
     }
   }
+  Set<const std::vector<OpDesc *>>(
+      details::kStaleProgramOpDescs,
+      new std::vector<OpDesc *>(program.Block(0).AllOps()));
   return var_nodes;
 }
...
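The idea of this hunk is to snapshot the program's op list exactly once, at graph construction, so passes that need the original op ordering can read it even after the graph has been mutated. A sketch of the same "stale snapshot" pattern with simplified types (not Paddle's):

#include <string>
#include <vector>

class GraphLike {
 public:
  explicit GraphLike(const std::vector<std::string>& program_ops)
      // Copied once at construction; the live graph may diverge afterwards,
      // which is exactly why the attribute is named "stale".
      : stale_program_ops_(program_ops) {}

  const std::vector<std::string>& StaleProgramOps() const {
    return stale_program_ops_;
  }

 private:
  std::vector<std::string> stale_program_ops_;
};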
paddle/fluid/framework/ir/graph.h
View file @ cf0511f2
...
@@ -31,7 +31,7 @@ namespace details {
 // This attr is not recommended, because the graph should not dependence
 // the program once it is built.
-constexpr char kAllOpDescs[] = "all_op_descs";
+constexpr char kStaleProgramOpDescs[] = "stale_program_op_descs";
 }  // namespace details

 namespace ir {
...
@@ -195,6 +195,12 @@ class Graph {
     return nullptr;
   }

+  // Returns reference to the original program.
+  // WARN: After a series of passes, the current graph can be quite
+  // different from OriginProgram. Caller shouldn't assume much from
+  // the returned OriginProgram.
+  const ProgramDesc &OriginProgram() const { return program_; }
+
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
     PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
...
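For reference, this is how the new accessor is exercised elsewhere in this same commit (a fragment, assuming a Paddle build and an existing ir::Graph named graph; see the memory_optimize_helper_test.cc hunks above):

std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
// Per the WARN in the header, treat this only as the program the graph
// started from, never as a description of the graph's current state.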
paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
View file @ cf0511f2
...
@@ -22,7 +22,8 @@ namespace ir {
 class IdentityScaleOpCleanPass : public FusePassBase {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

  private:
   virtual ~IdentityScaleOpCleanPass() = default;
...
paddle/fluid/framework/ir/lock_free_optimize_pass.h
View file @ cf0511f2
...
@@ -60,7 +60,8 @@ class LockFreeOptimizePass : public Pass {
   virtual ~LockFreeOptimizePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

  private:
   // Create a new sgd node via current optimizer node
...
paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
View file @ cf0511f2
...
@@ -29,7 +29,8 @@ class ConvBiasFusePass : public FusePassBase {
   virtual bool is_conv3d() const { return false; }

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_bias_mkldnn_fuse"};
 };
 /*
...
paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
0 → 100644
View file @ cf0511f2

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"

#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"

namespace paddle {
namespace framework {
namespace ir {

void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs) {
  auto* op = prog->MutableBlock(0)->AppendOp();
  op->SetType(type);
  if (type == "conv2d") {
    op->SetAttr("use_mkldnn", true);
    op->SetAttr("name", name);
    op->SetInput("Input", {inputs[0]});
    op->SetInput("Filter", {inputs[1]});
    if (inputs.size() > 2)
      op->SetInput("Bias", {inputs[2]});
    else
      op->SetInput("Bias", {});
  } else if (type == "elementwise_add") {
    op->SetAttr("use_mkldnn", true);
    op->SetInput("X", {inputs[0]});
    op->SetInput("Y", {inputs[1]});
  }
  op->SetOutput("Out", outputs);
  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
              static_cast<int>(OpRole::kForward));
}

// (c, weights)->conv->f
// (f)->elementwise_add->g
ProgramDesc BuildProgramDesc(bool convWithExistingBias) {
  ProgramDesc prog;
  std::vector<std::string> nodes{"c", "weights", "f", "eltwise_bias", "g"};
  if (convWithExistingBias) nodes.push_back("conv_bias");
  for (auto& v : nodes) {
    auto* var = prog.MutableBlock(0)->Var(v);
    var->SetType(proto::VarType::LOD_TENSOR);
    if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") {
      var->SetPersistable(true);
    }
  }

  // conv+bias, both with MKL-DNN
  if (convWithExistingBias) {
    SetOp(&prog, "conv2d", "conv",
          std::vector<std::string>({"c", "weights", "conv_bias"}),
          std::vector<std::string>({"f"}));
  } else {
    SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"c", "weights"}),
          std::vector<std::string>({"f"}));
  }
  SetOp(&prog, "elementwise_add", "eltwise",
        std::vector<std::string>({"f", "eltwise_bias"}),
        std::vector<std::string>({"g"}));

  return prog;
}

void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
                      const char* var_name) {
  auto x = scope->Var(var_name);
  auto tensor = x->GetMutable<LoDTensor>();
  tensor->mutable_data(place, proto::VarType::FP32,
                       ::paddle::memory::Allocator::kDefault, 1);
}

void MainTest(bool convWithExistingBias) {
  auto prog = BuildProgramDesc(convWithExistingBias);
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  auto place = paddle::platform::CPUPlace();
  NaiveExecutor exe{place};
  Scope scope;
  // Init scope, as it is used in pass
  exe.CreateVariables(prog, 0, true, &scope);
  if (convWithExistingBias) {
    InitTensorHolder(&scope, place, "conv_bias");
    InitTensorHolder(&scope, place, "eltwise_bias");
  }
  graph->Set(kParamScopeAttr, new framework::Scope*(&scope));

  auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass");
  int original_nodes_num = graph->Nodes().size();
  graph = pass->Apply(std::move(graph));
  int current_nodes_num = graph->Nodes().size();

  // Remove 3 Nodes: Conv, Bias, conv_out
  // Add 1 Node: ConvBias
  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);

  // Assert conv_bias op in newly generated graph
  int conv_bias_count = 0;

  for (auto* node : graph->Nodes()) {
    if (node->IsOp() && node->Op()->Type() == "conv2d") {
      auto* op = node->Op();
      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
      EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
      // check if "conv" convolution is fused
      auto op_name = boost::get<std::string>(op->GetAttr("name"));
      if (op_name == "conv") {
        auto input_names = op->InputNames();
        ASSERT_TRUE(std::find(input_names.begin(), input_names.end(),
                              "Bias") != input_names.end());
        auto bias = boost::get<std::vector<std::string>>(op->Input("Bias"));
        if (bias.size()) {
          ++conv_bias_count;
        }
      }
    }
  }
  EXPECT_EQ(conv_bias_count, 1);
}

TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); }

TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); }

TEST(ConvBiasFusePass, conv3d) {
  Conv3DBiasFusePass pass;
  ASSERT_TRUE(pass.is_conv3d());
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

USE_PASS(conv_bias_mkldnn_fuse_pass);
浏览文件 @
cf0511f2
...
@@ -31,7 +31,8 @@ class RepeatedFCReluFusePass : public FusePassBase {
...
@@ -31,7 +31,8 @@ class RepeatedFCReluFusePass : public FusePassBase {
virtual
~
RepeatedFCReluFusePass
()
{}
virtual
~
RepeatedFCReluFusePass
()
{}
protected:
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
;
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
const
std
::
string
name_scope_
{
"repeated_fc_relu_fuse"
};
const
std
::
string
name_scope_
{
"repeated_fc_relu_fuse"
};
};
};
...
...
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
View file @ cf0511f2
...
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#pragma once
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
...
@@ -25,7 +27,8 @@ class SeqConcatFcFusePass : public FusePassBase {
   virtual ~SeqConcatFcFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };

 }  // namespace ir
...
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
View file @ cf0511f2
...
@@ -28,7 +28,8 @@ class SeqConvEltAddReluFusePass : public FusePassBase {
   virtual ~SeqConvEltAddReluFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

   const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
 };
...
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
View file @ cf0511f2
...
@@ -42,7 +42,8 @@ class SeqPoolConcatFusePass : public FusePassBase {
   virtual ~SeqPoolConcatFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

   const std::string name_scope_{"seqpool_concat_fuse"};
 };
...
paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
View file @ cf0511f2
...
@@ -31,7 +31,8 @@ class SquaredMatSubFusePass : public FusePassBase {
   virtual ~SquaredMatSubFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;

   const std::string name_scope_{"squared_mat_sub_fuse"};
 };
...
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
View file @ cf0511f2
...
@@ -30,7 +30,8 @@ class TransposeFlattenConcatFusePass : public FusePassBase {
   virtual ~TransposeFlattenConcatFusePass() {}

  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };

 }  // namespace ir
...
paddle/fluid/framework/op_proto_maker.h
View file @ cf0511f2
...
@@ -27,7 +27,7 @@ enum class OpRole {
   kForward = 0x0000,
   kBackward = 0x0001,
   kOptimize = 0x0002,
-  // RPC role is for send/recv releated op
+  // RPC role is for send/recv related op
   kRPC = 0x0004,
   // Dist role is for split_byref/split_selected_rows/concat
   // used for distributed training.
...
paddle/fluid/framework/operator.cc
View file @ cf0511f2
...
@@ -904,6 +904,16 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }

+std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
+    const OpKernelType& key) const {
+  auto config_iter = kernel_configs_map_.find(key);
+  std::vector<KernelConfig>* kernel_configs = nullptr;
+  if (config_iter != kernel_configs_map_.end()) {
+    kernel_configs = &(config_iter->second);
+  }
+  return kernel_configs;
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   RuntimeContext ctx(Inputs(), Outputs(), scope);
...
@@ -921,7 +931,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   OpKernelMap& kernels = kernels_iter->second;

   auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx));
+      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;

   auto kernel_iter = kernels.find(expected_kernel_key);
...
@@ -940,6 +950,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
                  KernelTypeToString(expected_kernel_key));
   }

+  std::vector<KernelConfig>* kernel_configs =
+      GetKernelConfig(expected_kernel_key);
+
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
   auto* transfer_scope =
...
@@ -957,7 +970,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx));
+  kernel_iter->second(
+      ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs));

   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
...
paddle/fluid/framework/operator.h
浏览文件 @
cf0511f2
...
@@ -28,6 +28,7 @@ limitations under the License. */
...
@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor.h"
...
@@ -184,12 +185,30 @@ class OperatorBase {
...
@@ -184,12 +185,30 @@ class OperatorBase {
const
platform
::
Place
&
place
)
const
=
0
;
const
platform
::
Place
&
place
)
const
=
0
;
};
};
#ifdef PADDLE_WITH_CUDA
using
KernelConfig
=
boost
::
variant
<
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
,
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
,
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>>
;
#else
using
KernelConfig
=
boost
::
variant
<
boost
::
blank
>
;
#endif
using
OpKernelConfigsMap
=
std
::
unordered_map
<
OpKernelType
,
std
::
vector
<
KernelConfig
>
,
OpKernelType
::
Hash
>
;
class
ExecutionContext
{
class
ExecutionContext
{
public:
public:
ExecutionContext
(
const
OperatorBase
&
op
,
const
Scope
&
scope
,
ExecutionContext
(
const
OperatorBase
&
op
,
const
Scope
&
scope
,
const
platform
::
DeviceContext
&
device_context
,
const
platform
::
DeviceContext
&
device_context
,
const
RuntimeContext
&
ctx
)
const
RuntimeContext
&
ctx
,
:
op_
(
op
),
scope_
(
scope
),
device_context_
(
device_context
),
ctx_
(
ctx
)
{}
std
::
vector
<
KernelConfig
>*
configs
)
:
op_
(
op
),
scope_
(
scope
),
device_context_
(
device_context
),
ctx_
(
ctx
),
kernel_configs_
(
configs
)
{}
const
OperatorBase
&
op
()
const
{
return
op_
;
}
const
OperatorBase
&
op
()
const
{
return
op_
;
}
...
@@ -398,11 +417,20 @@ class ExecutionContext {
...
@@ -398,11 +417,20 @@ class ExecutionContext {
return
temp_tensor
;
return
temp_tensor
;
}
}
template
<
typename
T
>
T
&
GetKernelConfig
(
int
idx
)
const
{
PADDLE_ENFORCE
(
kernel_configs_
&&
kernel_configs_
->
size
()
>
idx
,
"%s selected kernel doesn't have kernel config %lu <= %d"
,
op_
.
Type
().
c_str
(),
kernel_configs_
->
size
(),
idx
);
return
*
boost
::
get
<
std
::
shared_ptr
<
T
>>
(
kernel_configs_
->
at
(
idx
));
}
private:
private:
const
OperatorBase
&
op_
;
const
OperatorBase
&
op_
;
const
Scope
&
scope_
;
const
Scope
&
scope_
;
const
platform
::
DeviceContext
&
device_context_
;
const
platform
::
DeviceContext
&
device_context_
;
const
RuntimeContext
&
ctx_
;
const
RuntimeContext
&
ctx_
;
mutable
std
::
vector
<
KernelConfig
>*
kernel_configs_
;
};
};
template
<>
template
<>
...
@@ -483,6 +511,8 @@ class OperatorWithKernel : public OperatorBase {
...
@@ -483,6 +511,8 @@ class OperatorWithKernel : public OperatorBase {
virtual
OpKernelType
GetExpectedKernelType
(
const
ExecutionContext
&
ctx
)
const
;
virtual
OpKernelType
GetExpectedKernelType
(
const
ExecutionContext
&
ctx
)
const
;
std
::
vector
<
KernelConfig
>*
GetKernelConfig
(
const
OpKernelType
&
key
)
const
;
protected:
protected:
virtual
OpKernelType
GetKernelTypeForVar
(
virtual
OpKernelType
GetKernelTypeForVar
(
const
std
::
string
&
var_name
,
const
Tensor
&
tensor
,
const
std
::
string
&
var_name
,
const
Tensor
&
tensor
,
...
@@ -508,6 +538,9 @@ class OperatorWithKernel : public OperatorBase {
...
@@ -508,6 +538,9 @@ class OperatorWithKernel : public OperatorBase {
void
TransferInplaceVarsBack
(
const
Scope
&
scope
,
void
TransferInplaceVarsBack
(
const
Scope
&
scope
,
const
std
::
vector
<
std
::
string
>&
inplace_vars
,
const
std
::
vector
<
std
::
string
>&
inplace_vars
,
const
Scope
&
exec_scope
)
const
;
const
Scope
&
exec_scope
)
const
;
protected:
mutable
OpKernelConfigsMap
kernel_configs_map_
;
};
};
extern
bool
OpSupportGPU
(
const
std
::
string
&
op_type
);
extern
bool
OpSupportGPU
(
const
std
::
string
&
op_type
);
...
...
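The plumbing above is easiest to read from a kernel's point of view: the framework looks up the std::vector<KernelConfig> registered for the chosen OpKernelType, and the kernel then pulls a typed cache out of its ExecutionContext. Below is a minimal sketch, not part of the diff, of how a cuDNN convolution kernel could do that; the function name, the slot-0 convention, and the trivial gen_func are hypothetical, while GetKernelConfig and AlgorithmsCache are the APIs introduced in this commit.

// Sketch only: assumes it runs inside a Paddle CUDA kernel with cuDNN available.
void UseFwdAlgoCache(const paddle::framework::ExecutionContext& ctx,
                     const std::vector<int64_t>& input_dims,
                     const std::vector<int64_t>& filter_dims,
                     const std::vector<int>& strides,
                     const std::vector<int>& paddings,
                     const std::vector<int>& dilations) {
  using paddle::framework::AlgorithmsCache;
  // Slot 0 is assumed (by kernel convention) to hold the forward-algo cache.
  auto& algo_cache =
      ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
  // gen_func stands in for the real cuDNN algorithm search; it only runs
  // on a cache miss for this particular shape/flags combination.
  auto gen_func = []() { return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; };
  cudnnConvolutionFwdAlgo_t algo = algo_cache.GetAlgorithm(
      input_dims, filter_dims, strides, paddings, dilations,
      /*algorithmFlags=*/0, gen_func);
  (void)algo;  // ...pass `algo` to cudnnConvolutionForward...
}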
paddle/fluid/framework/operator_kernel_configs.h (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <climits>
#include <functional>
#include <unordered_map>
#include <vector>
namespace paddle {
namespace framework {

// Not thread-safe. Should be owned per-kernel.
template <typename TAlgorithm>
class AlgorithmsCache {
 public:
  AlgorithmsCache() : search_times_(0) { hash_.clear(); }
  // Caches the best algorithm for a given
  // combination of tensor dimensions & compute data type.
  TAlgorithm GetAlgorithm(
      const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
      const std::vector<int>& strides, const std::vector<int>& paddings,
      const std::vector<int>& dilations,
      int algorithmFlags,  // can set for different data type
      std::function<TAlgorithm()> gen_func);

  TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
                          std::function<TAlgorithm()> gen_func);

 private:
  std::unordered_map<int64_t, TAlgorithm> hash_;
  int search_times_;
};

template <typename TAlgorithm>
TAlgorithm framework::AlgorithmsCache<TAlgorithm>::GetAlgorithm(
    const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
    const std::vector<int>& strides, const std::vector<int>& paddings,
    const std::vector<int>& dilations, int algorithmFlags,
    std::function<TAlgorithm()> gen_func) {
  int64_t seed = 0;
  // Hash all of the inputs, use to try and look up a previously
  // discovered algorithm, or fall back to generating a new one.
  std::hash<int64_t> hashFn;
  // do hash like boost
  // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
  for (const auto num : dims1) {
    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  }

  for (const auto num : dims2) {
    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
  }

  for (const auto num : strides) {
    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
            (seed >> 2) + 2;
  }

  for (const auto num : paddings) {
    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
            (seed >> 2) + 3;
  }

  for (const auto num : dilations) {
    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
            (seed >> 2) + 4;
  }

  seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
          (seed << 6) + (seed >> 2) + 5;

  if (seed == 0) return gen_func();

  if (hash_.find(seed) == hash_.end()) {
    TAlgorithm value = gen_func();
    hash_[seed] = value;
  }
  return hash_[seed];
}

template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
    int64_t area, int search_times, int algorithmFlags,
    std::function<TAlgorithm()> gen_func) {
  if (hash_.find(area) != hash_.end()) {
    return hash_[area];
  }
  if (search_times_ < search_times) {
    auto algo = gen_func();
    hash_[area] = algo;
    ++search_times_;
    return algo;
  }
  TAlgorithm algo;
  int64_t min = static_cast<uint64_t>(INT_MAX);
  for (const auto& m : hash_) {
    if (m.first < min) {
      min = m.first;
      algo = m.second;
    }
  }
  return algo;
}

}  // namespace framework
}  // namespace paddle
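The cache key is just the boost hash-combine recipe folded over every shape parameter, with a distinct additive tag per field so that, for example, identical strides and paddings contribute differently to the seed. The behaviour is easy to reproduce in isolation; a self-contained sketch with a hypothetical CombineHash helper (not Paddle code):

#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// The boost::hash_combine recipe used above,
// seed ^= h + 0x9e3779b9 + (seed << 6) + (seed >> 2), plus a field tag.
int64_t CombineHash(const std::vector<int64_t>& values, int64_t tag) {
  std::hash<int64_t> hash_fn;
  int64_t seed = 0;
  for (const auto v : values) {
    seed ^= hash_fn(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + tag;
  }
  return seed;
}

int main() {
  // The same numbers in a different order hash to a different key, so
  // differently shaped tensors do not share a cached algorithm.
  std::cout << CombineHash({32, 3, 224, 224}, 0) << "\n";
  std::cout << CombineHash({32, 224, 224, 3}, 0) << "\n";
  return 0;
}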
paddle/fluid/framework/parallel_executor.cc
...
@@ -185,9 +185,10 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
 ParallelExecutor::ParallelExecutor(
     const std::vector<platform::Place> &places,
     const std::unordered_set<std::string> &bcast_vars,
-    const ProgramDesc &main_program,
     const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
+    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
+    ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
...
@@ -221,12 +222,13 @@ ParallelExecutor::ParallelExecutor(
     PADDLE_ENFORCE(!member_->use_cuda_,
                    "gpu mode does not support async_mode_ now!");
   }
 
+  std::unique_ptr<ir::Graph> temp_owned_graph(graph);
+
   // FIXME(Yancey1989): parallel graph mode get better performance
   // in GPU allreduce distributed training. Need an elegant way to
   // choice the execution strategy.
   build_strategy.enable_parallel_graph_ = EnableParallelGraphExecution(
-      main_program, exec_strategy, build_strategy);
+      *temp_owned_graph, exec_strategy, build_strategy);
   if (build_strategy.enable_parallel_graph_)
     VLOG(0) << "The Executor would execute the graph by ParallelGraph "
                "Execution which can get better performance,"
...
@@ -260,41 +262,45 @@ ParallelExecutor::ParallelExecutor(
   if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
     BCastParamsToDevices(bcast_vars);
   }
   // Startup Program has been run. All local scopes has correct parameters.
 
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
   // ncclOp
-  std::unique_ptr<ir::Graph> graph;
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   if (build_strategy.async_mode_ && !build_strategy.is_distribution_) {
     VLOG(3) << "use local async mode";
-    graph = build_strategy.Apply(main_program, {member_->places_[0]},
-                                 loss_var_name, {member_->local_scopes_[0]},
-                                 member_->nranks_, member_->use_cuda_,
-                                 member_->nccl_ctxs_.get());
+    temp_owned_graph = build_strategy.Apply(
+        std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name,
+        {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_,
+        member_->nccl_ctxs_.get());
   } else {
-    graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
-                                 member_->local_scopes_, member_->nranks_,
-                                 member_->use_cuda_, member_->nccl_ctxs_.get());
+    temp_owned_graph = build_strategy.Apply(
+        std::move(temp_owned_graph), member_->places_, loss_var_name,
+        member_->local_scopes_, member_->nranks_, member_->use_cuda_,
+        member_->nccl_ctxs_.get());
   }
 #else
   if (build_strategy.async_mode_ && !build_strategy.is_distribution_) {
     VLOG(3) << "use local async mode";
-    graph = build_strategy.Apply(main_program, {member_->places_[0]},
-                                 loss_var_name, {member_->local_scopes_[0]},
-                                 member_->nranks_, member_->use_cuda_);
+    temp_owned_graph = build_strategy.Apply(
+        std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name,
+        {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_);
   } else {
-    graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
-                                 member_->local_scopes_, member_->nranks_,
-                                 member_->use_cuda_);
+    temp_owned_graph = build_strategy.Apply(
+        std::move(temp_owned_graph), member_->places_, loss_var_name,
+        member_->local_scopes_, member_->nranks_, member_->use_cuda_);
   }
 #endif
 
   auto max_memory_size = GetEagerDeletionThreshold();
   VLOG(10) << "Eager Deletion Threshold "
            << static_cast<float>(max_memory_size) / (1 << 30);
   if (max_memory_size >= 0) {
-    graph = member_->PrepareGCAndRefCnts(std::move(graph),
-                                         static_cast<size_t>(max_memory_size));
+    graph = member_
                ->PrepareGCAndRefCnts(std::move(temp_owned_graph),
                                      static_cast<size_t>(max_memory_size))
                .release();
+  } else {
+    graph = temp_owned_graph.release();
   }
 
   // Step 3. Create vars in each scope. Passes may also create new vars.
...
@@ -328,15 +334,14 @@ ParallelExecutor::ParallelExecutor(
     VLOG(3) << "use AsyncSSAGraphExecutor";
     member_->executor_.reset(new details::AsyncSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->places_,
-        std::move(graph)));
+        exec_strategy, member_->local_scopes_, member_->places_, graph));
   } else if (build_strategy.enable_parallel_graph_) {
     VLOG(3) << "use ParallelSSAGraphExecutor";
 #ifdef PADDLE_WITH_CUDA
     // TODO(Yancey1989): Remove passing in the main_program when
     // allreduce_seq_pass doesn't need it as the attr.
     member_->executor_.reset(new details::ParallelSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->places_, main_program,
-        std::move(graphs[0])));
+        exec_strategy, member_->local_scopes_, member_->places_, graph));
 #else
     PADDLE_THROW(
         "Paddle should be compiled with CUDA for ParallelGraph Execution.");
...
@@ -345,13 +350,11 @@ ParallelExecutor::ParallelExecutor(
   if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
     VLOG(3) << "use ThreadedSSAGraphExecutor";
     member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->places_,
-        std::move(graph)));
+        exec_strategy, member_->local_scopes_, member_->places_, graph));
   } else {
     VLOG(3) << "use FastThreadedSSAGraphExecutor";
     member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->places_,
-        std::move(graph)));
+        exec_strategy, member_->local_scopes_, member_->places_, graph));
   }
 }
...
@@ -491,26 +494,35 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
   }
 }
 
+ParallelExecutor::~ParallelExecutor() {
+  for (auto &p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+  delete member_;
+}
+
 bool ParallelExecutor::EnableParallelGraphExecution(
-    const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy,
+    const ir::Graph &graph, const ExecutionStrategy &exec_strategy,
     const BuildStrategy &build_strategy) const {
   if (!FLAGS_enable_parallel_graph) return false;
 
   bool enable_parallel_graph = true;
 
-  // TODO(Yancey1989): support sparse update in ParallelGraph mode.
-  for (auto &var_desc : main_program.Block(0).AllVars()) {
-    if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) {
-      enable_parallel_graph = false;
-      break;
-    }
-  }
-
-  // TODO(Yancey1989): support pserver mode
-  for (auto &op_desc : main_program.Block(0).AllOps()) {
-    if (op_desc->Type() == "send" || op_desc->Type() == "recv") {
-      enable_parallel_graph = false;
-      break;
-    }
-  }
+  for (ir::Node *node : graph.Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      // TODO(Yancey1989): support sparse update in ParallelGraph mode.
+      if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) {
+        enable_parallel_graph = false;
+        break;
+      }
+    } else if (node->IsOp() && node->Op()) {
+      // TODO(Yancey1989): support pserver mode
+      if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") {
+        enable_parallel_graph = false;
+        break;
+      }
+    }
+  }
 
   if (!member_->use_all_reduce_ || !member_->use_cuda_)
...
@@ -520,13 +532,6 @@ bool ParallelExecutor::EnableParallelGraphExecution(
   return enable_parallel_graph;
 }
 
-ParallelExecutor::~ParallelExecutor() {
-  for (auto &p : member_->places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
-  delete member_;
-}
-
 }  // namespace framework
 }  // namespace paddle
...
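Note what the refactor does to ownership: the caller now hands in a raw ir::Graph*, the constructor wraps it in a temporary unique_ptr while the build-strategy passes consume and return it, and release() hands the raw pointer back out for the executors. A self-contained sketch of that dance (hypothetical Graph/ApplyPass, not Paddle code):

#include <iostream>
#include <memory>

struct Graph { int passes_run = 0; };

// Stand-in for BuildStrategy::Apply: takes ownership, returns ownership.
std::unique_ptr<Graph> ApplyPass(std::unique_ptr<Graph> g) {
  g->passes_run += 1;
  return g;
}

int main() {
  Graph* graph = new Graph;                  // caller-owned, like the new ctor arg
  std::unique_ptr<Graph> temp_owned(graph);  // temp_owned_graph
  temp_owned = ApplyPass(std::move(temp_owned));
  graph = temp_owned.release();              // ownership handed back as a raw pointer
  std::cout << graph->passes_run << "\n";    // prints 1
  delete graph;
  return 0;
}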
paddle/fluid/framework/parallel_executor.h
...
@@ -46,11 +46,11 @@ class ParallelExecutor {
  public:
   explicit ParallelExecutor(const std::vector<platform::Place> &places,
                             const std::unordered_set<std::string> &bcast_vars,
-                            const ProgramDesc &main_program,
                             const std::string &loss_var_name, Scope *scope,
                             const std::vector<Scope *> &local_scopes,
                             const ExecutionStrategy &exec_strategy,
-                            const BuildStrategy &build_strategy);
+                            const BuildStrategy &build_strategy,
+                            ir::Graph *graph);
 
   ~ParallelExecutor();
...
@@ -71,7 +71,7 @@ class ParallelExecutor {
  private:
   void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
 
-  bool EnableParallelGraphExecution(const ProgramDesc &main_program,
+  bool EnableParallelGraphExecution(const ir::Graph &graph,
                                     const ExecutionStrategy &exec_strategy,
                                     const BuildStrategy &build_strategy) const;
...
paddle/fluid/framework/tensor.h
...
@@ -27,6 +27,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_utils.h"
+#endif
+
 namespace paddle {
 namespace framework {
...
@@ -37,10 +41,34 @@ class Tensor {
 #ifdef PADDLE_WITH_MKLDNN
  public:
-  inline mkldnn::memory::format format() const { return format_; }
+  // TODO(jczaja): This is deprecated and will be removed
+  inline mkldnn::memory::format format() const {
+    if (layout_ == DataLayout::kMKLDNN) {
+      return static_cast<mkldnn::memory::format>(mem_pd_.desc().data.format);
+    } else {
+      return mkldnn::memory::format::format_undef;
+    }
+  }
 
-  inline void set_format(const mkldnn::memory::format format) {
-    format_ = format;
+  // TODO(jczaja): This is deprecated and will be removed
+  inline void set_format(
+      const mkldnn::memory::format fmt,
+      mkldnn::memory::data_type data_type = mkldnn::memory::f32) {
+    mem_pd_ = paddle::platform::create_prim_desc_from_format(
+        paddle::framework::vectorize2int(dims()), fmt, data_type);
+    layout_ = DataLayout::kMKLDNN;
+  }
+
+  inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const {
+    return mem_pd_;
+  }
+
+  inline void set_mkldnn_prim_desc(
+      const mkldnn::memory::primitive_desc& mem_pd) {
+    // Internally MKL-DNN is just copying (increasing reference counter)
+    // to shared_ptr. So assignment should be quite cheap
+    mem_pd_ = mem_pd;
+    layout_ = DataLayout::kMKLDNN;
   }
 
  protected:
...
@@ -48,12 +76,9 @@ class Tensor {
   * @brief the detail format of memory block which have layout as kMKLDNN
   *
   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
-  *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
-  *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
-  *       this field.
+  *       nChw16c, etc. For a MKLDNN memory block, we store memory descriptor
   */
-  mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
+  mutable mkldnn::memory::primitive_desc mem_pd_;
 #endif
 
  public:
...
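With this change the tensor carries a full MKL-DNN memory primitive descriptor instead of a bare format enum, and format() is derived from it on demand. A sketch of how a kernel might publish its output descriptor under the new API; this assumes PADDLE_WITH_MKLDNN, a framework::Tensor* output, and an existing mkldnn::memory dst_mem, and is not compilable outside Paddle:

// Sketch: after an MKL-DNN primitive has written to `dst_mem`, attach the
// whole descriptor to the output tensor rather than only a format enum.
output->set_mkldnn_prim_desc(dst_mem.get_primitive_desc());
// The tensor's layout is now DataLayout::kMKLDNN; callers that still need
// the old enum get it via format(), which reads mem_pd_.desc().data.format.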
paddle/fluid/framework/var_type_traits.h
...
@@ -50,8 +50,6 @@ class Scope;
 }  // namespace framework
 
 namespace operators {
-template <typename T>
-class AlgorithmsCache;
 
 class CudnnRNNCache;
...
@@ -144,9 +142,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
 #ifndef _WIN32
     ncclUniqueId, platform::Communicator,
 #endif
-    operators::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>,
-    operators::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>,
-    operators::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>,
     operators::CudnnRNNCache,
 #endif
     int, float>;
...
paddle/fluid/imperative/layer.cc
...
@@ -249,7 +249,8 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
     framework::Scope scope;
     PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
     p.op.RuntimeInferShape(scope, place_, ctx);
-    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+    p.func(
+        framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, nullptr));
   }
 }
...
paddle/fluid/imperative/layer.h
...
@@ -44,8 +44,13 @@ class PreparedOp {
   PreparedOp(const framework::OperatorBase& op,
              const framework::RuntimeContext& ctx,
              framework::OperatorWithKernel::OpKernelFunc func,
-             platform::DeviceContext* dev_ctx)
-      : op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {}
+             platform::DeviceContext* dev_ctx,
+             std::vector<framework::KernelConfig>* kernel_configs)
+      : op(op),
+        ctx(ctx),
+        func(func),
+        dev_ctx(dev_ctx),
+        kernel_configs(kernel_configs) {}
 
   static PreparedOp Prepare(const framework::RuntimeContext& ctx,
                             const framework::OperatorWithKernel& op,
...
@@ -64,8 +69,9 @@ class PreparedOp {
     framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second;
 
-    auto expected_kernel_key = op.GetExpectedKernelType(
-        framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx));
+    auto expected_kernel_key =
+        op.GetExpectedKernelType(framework::ExecutionContext(
+            op, framework::Scope(), *dev_ctx, ctx, nullptr));
     VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
     auto kernel_iter = kernels.find(expected_kernel_key);
...
@@ -83,7 +89,9 @@ class PreparedOp {
       PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
                    KernelTypeToString(expected_kernel_key));
     }
-    return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
+    std::vector<framework::KernelConfig>* kernel_configs =
+        op.GetKernelConfig(expected_kernel_key);
+    return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
   }
 
   inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; }
...
@@ -92,6 +100,7 @@ class PreparedOp {
   const framework::RuntimeContext& ctx;
   framework::OperatorWithKernel::OpKernelFunc func;
   platform::DeviceContext* dev_ctx;
+  std::vector<framework::KernelConfig>* kernel_configs;
 };
 
 class OpBase;
...
@@ -105,23 +114,23 @@ class VarBase {
  public:
   VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
 
+  explicit VarBase(bool stop_gradient)
+      : VarBase(new framework::Variable(),
+                stop_gradient ? nullptr : new VarBase(true), stop_gradient) {}
+
+  // Owns `var` and `grad`
   VarBase(framework::Variable* var, VarBase* grad)
+      : VarBase(var, grad, false) {}
+
+ private:
+  VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient)
       : var_desc_(nullptr),
         var_(var),
         grads_(grad),
-        stop_gradient_(false),
-        pre_op_(nullptr),
-        pre_op_out_idx_(-1) {}
-
-  explicit VarBase(bool stop_gradient)
-      : var_desc_(nullptr),
-        var_(new framework::Variable()),
-        grads_(stop_gradient ? nullptr : new VarBase(true)),
         stop_gradient_(stop_gradient),
         pre_op_(nullptr),
         pre_op_out_idx_(-1) {}
 
+ public:
   virtual ~VarBase() {
     if (var_) {
       delete var_;
...
@@ -132,11 +141,13 @@ class VarBase {
     }
   }
 
-  OpBase* PreOp() const { return pre_op_; }
-  int PreOpOutIdx() const { return pre_op_out_idx_; }
+  inline OpBase* PreOp() const { return pre_op_; }
+  inline int PreOpOutIdx() const { return pre_op_out_idx_; }
 
-  void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; }
-  bool IsStopGradient() const { return stop_gradient_; }
+  inline void SetStopGradient(bool stop_gradient) {
+    stop_gradient_ = stop_gradient;
+  }
+  inline bool IsStopGradient() const { return stop_gradient_; }
 
   void RunBackward();
...
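The VarBase change is the classic delegating-constructor cleanup: every public constructor funnels into one private constructor that owns the member-init list, so the stop-gradient logic lives in exactly one place. A self-contained sketch of the same pattern (hypothetical Var class, not Paddle code):

#include <iostream>
#include <string>

class Var {
 public:
  Var() : Var("tmp", /*stop_gradient=*/false) {}
  explicit Var(bool stop_gradient) : Var("tmp", stop_gradient) {}
  explicit Var(const std::string& name) : Var(name, false) {}

 private:
  // Single point of truth for the member-init list, mirroring the private
  // VarBase(var, grad, stop_gradient) constructor in the diff above.
  Var(const std::string& name, bool stop_gradient)
      : name_(name), stop_gradient_(stop_gradient) {}

 public:
  const std::string& name() const { return name_; }
  bool stop_gradient() const { return stop_gradient_; }

 private:
  std::string name_;
  bool stop_gradient_;
};

int main() {
  Var v(true);
  std::cout << v.name() << " " << v.stop_gradient() << "\n";  // prints: tmp 1
  return 0;
}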
paddle/fluid/imperative/tracer.cc
...
@@ -14,6 +14,8 @@
 #include "paddle/fluid/imperative/tracer.h"
 
+#include <set>
+
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
...
@@ -66,8 +68,9 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
   return result;
 }
 
-void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
+std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
                    const VarBasePtrMap& outputs, framework::BlockDesc* block,
                    const platform::Place expected_place,
                    const bool stop_gradient) {
   std::map<std::string, VarBase*> vars;
...
@@ -76,6 +79,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   VLOG(3) << "tracer tracing " << op_desc->Type();
   op_desc->InferShape(*block);
   op_desc->InferVarType(block);
+
   std::unique_ptr<framework::OperatorBase> op_base =
       framework::OpRegistry::CreateOp(*op_desc);
...
@@ -92,7 +96,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       invars.emplace_back(inp->var_);
       vars[inp->var_desc_->Name()] = inp;
-      if (inp->PreOp()) {
+      if (inp->PreOp() && !inp->IsStopGradient()) {
         op->pre_ops_[it.first].push_back(inp->PreOp());
         op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx());
       } else {
...
@@ -138,8 +142,11 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   op->place_ = GetExpectedPlace(expected_place, inputs);
   PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
   prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
-  prepared_op.func(framework::ExecutionContext(
-      prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx));
+  prepared_op.func(
+      framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx,
+                                  prepared_op.ctx, prepared_op.kernel_configs));
 
+  std::set<std::string> vars_saved_for_backward;
+
   if (!stop_gradient) {
     std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
...
@@ -160,6 +167,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
           PADDLE_ENFORCE(fwd_var_it != vars.end());
           // Forward inputs or outputs.
           grad_in_vars.push_back(fwd_var_it->second->var_);
+          vars_saved_for_backward.insert(it.first);
         } else {
           VarBase* var = vars[var_it->second];
           if (!var->grads_->var_->IsInitialized()) {
...
@@ -193,6 +201,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   }
 
   op->block_ = block;
+  return vars_saved_for_backward;
 }
 
 std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
...
@@ -202,7 +211,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
   op->input_vars_[PyLayer::kFwdInp] = inputs;
   op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs);
   for (VarBase* inp : inputs) {
-    if (inp->PreOp()) {
+    if (inp->PreOp() && !inp->IsStopGradient()) {
       op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp());
       op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx());
     } else {
...
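The `PreOp() && !IsStopGradient()` guard is the substantive change here: an input marked stop-gradient no longer wires its producing op into the backward graph, even if one exists. A self-contained sketch of that gating logic (hypothetical Input type, not Paddle code):

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Input {
  std::string name;
  bool has_pre_op;     // stands in for inp->PreOp() != nullptr
  bool stop_gradient;  // stands in for inp->IsStopGradient()
};

// Returns the inputs that would be wired into the backward graph under the
// new condition in Tracer::Trace / Tracer::PyTrace.
std::set<std::string> WireBackward(const std::vector<Input>& inputs) {
  std::set<std::string> wired;
  for (const auto& in : inputs) {
    if (in.has_pre_op && !in.stop_gradient) {
      wired.insert(in.name);
    }
  }
  return wired;
}

int main() {
  std::vector<Input> inputs = {{"x", true, false},     // wired
                               {"label", true, true},  // pruned: stop-gradient
                               {"w", false, false}};   // pruned: no producer
  for (const auto& name : WireBackward(inputs)) std::cout << name << "\n";
  return 0;
}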
paddle/fluid/imperative/tracer.h
...
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <map>
+#include <set>
 #include <string>
 #include <vector>
...
@@ -43,8 +44,9 @@ class Tracer {
   virtual ~Tracer() {}
 
-  void Trace(OpBase* op, const VarBasePtrMap& inputs,
+  std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
              const VarBasePtrMap& outputs, framework::BlockDesc* block,
              const platform::Place expected_place,
              const bool stop_gradient = false);
...
paddle/fluid/inference/api/analysis_config.cc
...
@@ -89,7 +89,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(params_file_);
   CP_MEMBER(model_from_memory_);  // the memory model reuses prog_file_ and
                                   // params_file_ fields.
-  // Gpu releated.
+  // Gpu related.
   CP_MEMBER(use_gpu_);
   CP_MEMBER(device_id_);
   CP_MEMBER(memory_pool_init_size_mb_);
...
@@ -97,13 +97,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(enable_memory_optim_);
   CP_MEMBER(static_memory_optim_);
   CP_MEMBER(static_memory_optim_force_update_);
-  // TensorRT releated.
+  // TensorRT related.
   CP_MEMBER(use_tensorrt_);
   CP_MEMBER(tensorrt_workspace_size_);
   CP_MEMBER(tensorrt_max_batchsize_);
   CP_MEMBER(tensorrt_min_subgraph_size_);
   CP_MEMBER(tensorrt_precision_mode_);
-  // MKLDNN releated.
+  // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
...
paddle/fluid/inference/api/analysis_predictor.cc
...
@@ -392,7 +392,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
   VLOG(3) << "create AnalysisConfig";
   if (config.use_gpu()) {
-    // 1. GPU memeroy
+    // 1. GPU memory
     PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
     PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
                       config.gpu_device_id());
...
@@ -726,7 +726,7 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
   return need;
 }
 
-std::string AnalysisPredictor::GetSeriazlizedProgram() const {
+std::string AnalysisPredictor::GetSerializedProgram() const {
   return inference_program_->Proto()->SerializeAsString();
 }
...
paddle/fluid/inference/api/analysis_predictor.h
...
@@ -74,7 +74,7 @@ class AnalysisPredictor : public PaddlePredictor {
   void SetMkldnnThreadID(int tid);
 
-  std::string GetSeriazlizedProgram() const override;
+  std::string GetSerializedProgram() const override;
 
  protected:
   // For memory optimization.
...
paddle/fluid/inference/api/analysis_predictor_tester.cc
...
@@ -214,8 +214,8 @@ TEST(AnalysisPredictor, memory_optim) {
   {
     // The first predictor help to cache the memory optimize strategy.
     auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-    LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
-    ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());
+    LOG(INFO) << "serialized program: " << predictor->GetSerializedProgram();
+    ASSERT_FALSE(predictor->GetSerializedProgram().empty());
 
     // Run several times to check the parameters are not reused by mistake.
     for (int i = 0; i < 5; i++) {
...
paddle/fluid/inference/api/api_impl.cc
...
@@ -290,7 +290,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
   VLOG(3) << "create NativePaddlePredictor";
   if (config.use_gpu) {
-    // 1. GPU memeroy
+    // 1. GPU memory
     PADDLE_ENFORCE_GE(
         config.fraction_of_gpu_memory, 0.f,
         "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
...
paddle/fluid/inference/api/paddle_analysis_config.h
...
@@ -212,12 +212,12 @@ struct AnalysisConfig {
   std::string prog_file_;
   std::string params_file_;
 
-  // GPU releated.
+  // GPU related.
   bool use_gpu_{false};
   int device_id_{0};
   uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
 
-  // TensorRT releated.
+  // TensorRT related.
   bool use_tensorrt_{false};
   // For workspace_size, refer it from here:
   // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
...
paddle/fluid/inference/api/paddle_api.h
...
@@ -248,7 +248,7 @@ class PaddlePredictor {
   /** \brief Get the serialized model program that executes in inference phase.
    * Its data type is ProgramDesc, which is a protobuf message.
    */
-  virtual std::string GetSeriazlizedProgram() const {
+  virtual std::string GetSerializedProgram() const {
     assert(false);  // Force raise error.
     return "NotImplemented";
   }
...
paddle/fluid/inference/tests/api/CMakeLists.txt
...
@@ -60,10 +60,13 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
 
+# TODO(luotao, Superjom) Disable DAM test, temporarily fix
+# https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914.
+# After inference framework refactor, will reopen it.
 # normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
+# inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
 
 # small DAM
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
...
paddle/fluid/operators/CMakeLists.txt
...
@@ -66,7 +66,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
...
@@ -97,3 +97,4 @@ if (WITH_PYTHON)
 endif()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
+add_subdirectory(benchmark)
paddle/fluid/operators/beam_search_decode_op.cc
...
@@ -123,7 +123,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
     auto& dev_ctx = *pool.Get(dev_place);
 
     framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
-    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx);
+    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx, nullptr);
 
     const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
     const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
...
paddle/fluid/operators/beam_search_decode_op.h
...
@@ -122,7 +122,7 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
   auto cpu_place = std::unique_ptr<paddle::platform::CPUPlace>(
       new paddle::platform::CPUPlace());
-  paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place.get());
+  paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
 
   framework::LoD lod;
   lod.push_back(source_level_lod);
...
paddle/fluid/operators/benchmark/CMakeLists.txt (new file)
cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
        DEPS memory timer framework_proto proto_desc lod_tensor op_registry
             device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
paddle/fluid/operators/benchmark/op_tester.cc (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester.h"
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace operators {
namespace benchmark {

DEFINE_string(op_config_list, "", "Path of op config file.");

void OpTester::Init(const std::string &filename) {
  Init(OpTesterConfig(filename));
}

void OpTester::Init(const OpTesterConfig &config) {
  config_ = config;

  auto &op_desc_info = framework::OpInfoMap::Instance();
  // Initialize the OpDesc
  if (op_desc_info.Has(config_.op_type)) {
    type_ = config_.op_type;
    op_desc_.SetType(config_.op_type);
    CreateInputVarDesc();
    CreateOutputVarDesc();
  } else {
    LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered.";
  }

  if (config_.device_id >= 0) {
    place_ = paddle::platform::CUDAPlace(config_.device_id);
  } else {
    place_ = paddle::platform::CPUPlace();
  }

  framework::InitDevices(false);
  scope_.reset(new paddle::framework::Scope());

  op_ = framework::OpRegistry::CreateOp(op_desc_);
  CreateVariables(scope_.get());
}

void OpTester::Run() {
  if (config_.print_debug_string) {
    LOG(INFO) << DebugString();
  }

  // Warm up
  RunImpl();

  platform::Timer timer;
  if (config_.profile) {
    if (platform::is_cpu_place(place_)) {
      platform::EnableProfiler(platform::ProfilerState::kCPU);
    } else {
#ifdef PADDLE_WITH_CUDA
      platform::EnableProfiler(platform::ProfilerState::kAll);
      platform::SetDeviceId(config_.device_id);
#else
      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
    }

    timer.Start();
    for (int i = config_.repeat; i > 0; --i) {
      RunImpl();
    }
    timer.Pause();
    platform::DisableProfiler(platform::EventSortingKey::kDefault,
                              "op_tester_profiler");
  } else {
    timer.Start();
    for (int i = config_.repeat; i > 0; --i) {
      RunImpl();
    }
    timer.Pause();
  }
  config_.runtime = timer.ElapsedMS() / config_.repeat;
  LOG(INFO) << "=== Run " << config_.repeat
            << " times, latency: " << config_.runtime << " ms ===";
}

void OpTester::RunImpl() {
  op_->Run(*scope_, place_);
  platform::DeviceContextPool::Instance().Get(place_)->Wait();
  scope_->DropKids();
}

std::vector<std::string> OpTester::GetOpProtoInputNames() {
  std::vector<std::string> input_names;
  const framework::proto::OpProto &proto =
      framework::OpInfoMap::Instance().Get(type_).Proto();
  for (int i = 0; i != proto.inputs_size(); ++i) {
    const auto &input = proto.inputs(i);
    input_names.push_back(input.name());
  }
  return input_names;
}

std::vector<std::string> OpTester::GetOpProtoOutputNames() {
  std::vector<std::string> output_names;
  const framework::proto::OpProto &proto =
      framework::OpInfoMap::Instance().Get(type_).Proto();
  for (int i = 0; i != proto.outputs_size(); ++i) {
    const auto &output = proto.outputs(i);
    output_names.push_back(output.name());
  }
  return output_names;
}

void OpTester::CreateInputVarDesc() {
  std::vector<std::string> input_names = GetOpProtoInputNames();
  for (auto &name : input_names) {
    const OpInputConfig *input = config_.GetInput(name);
    if (input == nullptr) {
      LOG(FATAL) << "The input " << name << " of op " << config_.op_type
                 << " is not correctly provided.";
    }

    std::string var_name = config_.op_type + "." + name;
    framework::VarDesc *var = Var(var_name);
    // Need to support more type
    var->SetType(framework::proto::VarType::LOD_TENSOR);
    var->SetPersistable(false);
    var->SetDataType(framework::proto::VarType::FP32);
    var->SetShape(input->dims);

    op_desc_.SetInput(name, {var_name});
    inputs_.push_back(var_name);
  }
}

void OpTester::CreateOutputVarDesc() {
  std::vector<std::string> output_names = GetOpProtoOutputNames();
  for (auto &name : output_names) {
    std::string var_name = config_.op_type + "." + name;
    framework::VarDesc *var = Var(var_name);
    // Need to support more type
    var->SetType(framework::proto::VarType::LOD_TENSOR);
    var->SetPersistable(false);
    var->SetDataType(framework::proto::VarType::FP32);

    op_desc_.SetOutput(name, {var_name});
    outputs_.push_back(var_name);
  }
}

framework::VarDesc *OpTester::Var(const std::string &name) {
  auto it = vars_.find(name);
  if (it != vars_.end()) {
    return it->second.get();
  }
  auto *var = new framework::VarDesc(name);
  vars_[name].reset(var);
  return var;
}

template <typename T>
void OpTester::SetupTensor(framework::LoDTensor *tensor,
                           const std::vector<int64_t> &shape, T lower,
                           T upper) {
  static unsigned int seed = 100;
  std::mt19937 rng(seed++);
  std::uniform_real_distribution<double> uniform_dist(0, 1);

  T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
  if (platform::is_cpu_place(place_)) {
    for (int i = 0; i < tensor->numel(); ++i) {
      ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
    }
  } else {
    framework::LoDTensor cpu_tensor;
    T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
                                            platform::CPUPlace());
    for (int i = 0; i < cpu_tensor.numel(); ++i) {
      cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
    }
    TensorCopySync(cpu_tensor, place_, tensor);
  }
}

void OpTester::CreateVariables(framework::Scope *scope) {
  for (auto &item : vars_) {
    auto &var = item.second;
    if (var->Name() == framework::kEmptyVarName) {
      continue;
    }

    auto *ptr = scope->Var(var->Name());
    framework::InitializeVariable(ptr, var->GetType());
    if (var->Persistable()) {
      VLOG(3) << "Create Variable " << var->Name()
              << " global, which pointer is " << ptr;
    } else {
      VLOG(3) << "Create Variable " << var->Name()
              << " locally, which pointer is " << ptr;
    }
  }

  // Allocate memory for input tensor
  for (auto &name : inputs_) {
    VLOG(3) << "Allocate memory for tensor " << name;
    auto &var_desc = vars_[name];
    std::vector<int64_t> shape = var_desc->GetShape();

    auto *var = scope->Var(name);
    auto *tensor = var->GetMutable<framework::LoDTensor>();
    SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
                       static_cast<float>(1.0));
  }
}

static std::string GenSpaces(int count) {
  std::stringstream ss;
  for (int i = 0; i < count; ++i) {
    ss << "  ";
  }
  return ss.str();
}

std::string OpTester::DebugString() {
  std::stringstream ss;
  int count = 0;
  for (auto &item : vars_) {
    auto &var = item.second;
    ss << GenSpaces(count++) << "vars {\n";
    ss << GenSpaces(count) << "name: \"" << var->Name() << "\"\n";
    ss << GenSpaces(count++) << "type: {\n";
    ss << GenSpaces(count) << "type: LOD_TENSOR\n";
    ss << GenSpaces(count++) << "lod_tensor {\n";
    ss << GenSpaces(count++) << "tensor {\n";
    ss << GenSpaces(count) << "data_type: FP32\n";
    std::vector<int64_t> shape = var->GetShape();
    for (auto d : shape) {
      ss << GenSpaces(count) << "dims: " << d << "\n";
    }
    ss << GenSpaces(--count) << "}\n";
    ss << GenSpaces(--count) << "}\n";
    ss << GenSpaces(--count) << "}\n";
    ss << GenSpaces(count) << "persistable: " << var->Persistable() << "\n";
    ss << GenSpaces(--count) << "}\n";
  }
  ss << GenSpaces(count++) << "ops {\n";
  for (auto &name : op_desc_.InputNames()) {
    ss << GenSpaces(count++) << "inputs {\n";
    ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
    ss << GenSpaces(count) << "arguments: \"" << op_desc_.Input(name)[0]
       << "\"\n";
    ss << GenSpaces(--count) << "}\n";
  }
  for (auto &name : op_desc_.OutputNames()) {
    ss << GenSpaces(count++) << "outputs {\n";
    ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
    ss << GenSpaces(count) << "arguments: \"" << op_desc_.Output(name)[0]
       << "\"\n";
    ss << GenSpaces(--count) << "}\n";
  }
  ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
  ss << GenSpaces(--count) << "}\n";
  return ss.str();
}

TEST(op_tester, base) {
  OpTester tester;
  if (!FLAGS_op_config_list.empty()) {
    tester.Init(FLAGS_op_config_list);
  } else {
    OpTesterConfig config;
    config.op_type = "elementwise_add";
    config.inputs.resize(2);
    config.inputs[0].name = "X";
    config.inputs[0].dims = {64, 64};
    config.inputs[1].name = "Y";
    config.inputs[1].dims = {64, 1};
    tester.Init(config);
  }
  tester.Run();
}

}  // namespace benchmark
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/benchmark/op_tester.h (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
namespace paddle {
namespace operators {
namespace benchmark {

class OpTester {
 public:
  OpTester() {}

  void Init(const std::string &filename);
  void Init(const OpTesterConfig &config);

  void Run();

  std::string DebugString();

 private:
  std::vector<std::string> GetOpProtoInputNames();
  std::vector<std::string> GetOpProtoOutputNames();

  void CreateInputVarDesc();
  void CreateOutputVarDesc();

  framework::VarDesc *Var(const std::string &name);
  void CreateVariables(framework::Scope *scope);

  template <typename T>
  void SetupTensor(framework::LoDTensor *input,
                   const std::vector<int64_t> &shape, T lower, T upper);

  void RunImpl();

 private:
  OpTesterConfig config_;
  std::string type_;
  framework::OpDesc op_desc_;
  std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
  std::vector<std::string> inputs_;
  std::vector<std::string> outputs_;
  std::unique_ptr<framework::OperatorBase> op_;
  platform::Place place_;
  std::unique_ptr<framework::Scope> scope_;
};

}  // namespace benchmark
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/benchmark/op_tester_config.cc (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
#include <fstream>
#include <sstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace benchmark {

static const char kStartSeparator[] = "{";
static const char kEndSeparator[] = "}";
static const char kSepBetweenItems[] = ";";

static bool StartWith(const std::string& str, const std::string& substr) {
  return str.find(substr) == 0;
}

static bool EndWith(const std::string& str, const std::string& substr) {
  return str.rfind(substr) == (str.length() - substr.length());
}

static void EraseEndSep(std::string* str) {
  std::string substr = kSepBetweenItems;
  if (EndWith(*str, substr)) {
    str->erase(str->length() - substr.length(), str->length());
  }
}

static std::vector<int64_t> ParseDims(std::string dims_str) {
  std::vector<int64_t> dims;
  std::string token;
  std::istringstream token_stream(dims_str);
  while (std::getline(token_stream, token, 'x')) {
    dims.push_back(std::stoi(token));
  }
  return dims;
}

OpInputConfig::OpInputConfig(std::istream& is) {
  std::string sep;
  is >> sep;
  if (sep == kStartSeparator) {
    while (sep != kEndSeparator) {
      is >> sep;
      if (sep == "name" || sep == "name:") {
        is >> name;
        EraseEndSep(&name);
      } else if (sep == "dims" || sep == "dims:") {
        std::string dims_str;
        is >> dims_str;
        dims = ParseDims(dims_str);
      }
    }
  }
}

OpTesterConfig::OpTesterConfig(const std::string& filename) {
  std::ifstream fin(filename, std::ios::in | std::ios::binary);
  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
                 filename.c_str());
  Init(fin);
}

void OpTesterConfig::Init(std::istream& is) {
  std::string sep;
  is >> sep;
  if (sep == kStartSeparator) {
    while (sep != kEndSeparator) {
      is >> sep;
      if (sep == "op_type" || sep == "op_type:") {
        is >> op_type;
      } else if (sep == "device_id" || sep == "device_id:") {
        is >> device_id;
      } else if (sep == "repeat" || sep == "repeat:") {
        is >> repeat;
      } else if (sep == "profile" || sep == "profile:") {
        is >> profile;
      } else if (sep == "print_debug_string" || sep == "print_debug_string:") {
        is >> print_debug_string;
      } else if (sep == "input" || sep == "input:") {
        OpInputConfig input_config(is);
        inputs.push_back(input_config);
      }
    }
  }
}

const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
  for (size_t i = 0; i < inputs.size(); ++i) {
    if (inputs[i].name == name) {
      return &inputs[i];
    }
  }
  return nullptr;
}

}  // namespace benchmark
}  // namespace operators
}  // namespace paddle
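For reference, here is one config this parser would accept, reconstructed from the code above rather than taken from the repository: keys are whitespace-separated tokens that may carry a trailing colon, `dims` values are `x`-separated integers, and each `input` opens a nested `{ ... }` block (note values are read with `operator>>`, so they must not carry trailing separators; only `name` values get a trailing `;` stripped via EraseEndSep):

    {
      op_type: elementwise_add
      device_id: -1
      repeat: 100
      input: { name: X dims: 64x64 }
      input: { name: Y dims: 64x1 }
    }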
paddle/fluid/operators/benchmark/op_tester_config.h
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <istream>
#include <string>
#include <vector>
namespace paddle {
namespace operators {
namespace benchmark {

struct OpInputConfig {
  OpInputConfig() {}
  explicit OpInputConfig(std::istream& is);

  std::string name;
  std::vector<int64_t> dims;
};

struct OpTesterConfig {
  OpTesterConfig() {}
  explicit OpTesterConfig(const std::string& filename);

  void Init(std::istream& is);

  const OpInputConfig* GetInput(const std::string& name);

  std::string op_type;
  std::vector<OpInputConfig> inputs;
  int device_id{-1};  // CPU: -1
  int repeat{1};
  int profile{0};
  int print_debug_string{0};
  double runtime{0.0};
};

}  // namespace benchmark
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/conv_cudnn_op.cu.cc
...
@@ -42,6 +42,7 @@ using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
+using framework::AlgorithmsCache;

 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
...
@@ -169,18 +170,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
               workspace_size_limit, &algo));
       VLOG(3) << "cuDNN forward algo " << algo;
     } else if (exhaustive_search && (!half_float)) {
-      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
-      if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
-        algo_cache =
-            ctx.scope()
-                .FindVar(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      } else {
-        algo_cache =
-            const_cast<framework::Scope&>(ctx.scope())
-                .Var(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      }
+      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
+          ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
       cudnn_workspace =
           ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
               framework::make_ddim(
@@ -188,7 +179,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
               dev_ctx);
       cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());

-      algo = algo_cache->GetAlgorithm(
+      algo = algo_cache.GetAlgorithm(
           x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
             int returned_algo_count;
             std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
...
@@ -382,22 +373,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       if (exhaustive_search) {
-        AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
-        if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
-          data_algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNBwdDataAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
-        } else {
-          data_algo_cache =
-              const_cast<framework::Scope&>(ctx.scope())
-                  .Var(kCUDNNBwdDataAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
-        }
-        data_algo = data_algo_cache->GetAlgorithm(
+        AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>& data_algo_cache =
+            ctx.GetKernelConfig<
+                AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>(0);
+        data_algo = data_algo_cache.GetAlgorithm(
             x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
               int returned_algo_count;
               std::array<cudnnConvolutionBwdDataAlgoPerf_t,
...
@@ -448,22 +428,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       if (exhaustive_search) {
-        AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
-        if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
-          f_algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNBwdFilterAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
-        } else {
-          f_algo_cache =
-              const_cast<framework::Scope&>(ctx.scope())
-                  .Var(kCUDNNBwdFilterAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
-        }
-        filter_algo = f_algo_cache->GetAlgorithm(
+        AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>& f_algo_cache =
+            ctx.GetKernelConfig<
+                AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>(1);
+        filter_algo = f_algo_cache.GetAlgorithm(
             x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
               int returned_algo_count;
               std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
...
paddle/fluid/operators/conv_cudnn_op_cache.h
...
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <functional>
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cudnn_helper.h"

 DECLARE_uint64(conv_workspace_size_limit);
...
@@ -46,100 +47,5 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
 static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
 #endif

-template <typename TAlgorithm>
-class AlgorithmsCache {
- public:
-  AlgorithmsCache() : search_times_(0) { hash_.clear(); }
-  // Caches the best algorithm for a given
-  // combination of tensor dimensions & compute data type.
-  TAlgorithm GetAlgorithm(
-      const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
-      const std::vector<int>& strides, const std::vector<int>& paddings,
-      const std::vector<int>& dilations,
-      int algorithmFlags,  // can set for different data type
-      std::function<TAlgorithm()> gen_func);
-
-  TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
-                          std::function<TAlgorithm()> gen_func);
-
- private:
-  std::unordered_map<int64_t, TAlgorithm> hash_;
-  std::mutex mutex_;
-  int search_times_;
-};
-
-template <typename TAlgorithm>
-TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
-    const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
-    const std::vector<int>& strides, const std::vector<int>& paddings,
-    const std::vector<int>& dilations, int algorithmFlags,
-    std::function<TAlgorithm()> gen_func) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  int64_t seed = 0;
-  // Hash all of the inputs, use to try and look up a previously
-  // discovered algorithm, or fall back to generating a new one.
-  std::hash<int64_t> hashFn;
-  // do hash like boost
-  // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
-  for (const auto num : dims1) {
-    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-  }
-  for (const auto num : dims2) {
-    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
-  }
-  for (const auto num : strides) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 2;
-  }
-  for (const auto num : paddings) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 3;
-  }
-  for (const auto num : dilations) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 4;
-  }
-  seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
-          (seed << 6) + (seed >> 2) + 5;
-
-  if (seed == 0) return gen_func();
-
-  if (hash_.find(seed) == hash_.end()) {
-    TAlgorithm value = gen_func();
-    hash_[seed] = value;
-  }
-  return hash_[seed];
-}
-
-template <typename TAlgorithm>
-TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
-    int64_t area, int search_times, int algorithmFlags,
-    std::function<TAlgorithm()> gen_func) {
-  if (hash_.find(area) != hash_.end()) {
-    return hash_[area];
-  }
-  if (search_times_ < search_times) {
-    auto algo = gen_func();
-    hash_[area] = algo;
-    ++search_times_;
-    return algo;
-  }
-  TAlgorithm algo;
-  int64_t min = static_cast<uint64_t>(INT_MAX);
-  for (const auto& m : hash_) {
-    if (m.first < min) {
-      min = m.first;
-      algo = m.second;
-    }
-  }
-  return algo;
-}
 }  // namespace operators
 }  // namespace paddle
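The deleted `GetAlgorithm` overload keys the cache with a boost-style hash combine over every shape parameter, so that the order of values matters and distinct shapes rarely collide. A minimal, self-contained sketch of that combine (the function name here is illustrative, not from the tree):

#include <cstdint>
#include <functional>
#include <vector>

// Boost-style hash combine, as used by the removed AlgorithmsCache:
// each value is folded into the running seed, so both the values and
// their order contribute to the final key.
int64_t CombineShapeHash(const std::vector<int64_t>& dims) {
  std::hash<int64_t> hash_fn;
  int64_t seed = 0;
  for (const auto num : dims) {
    seed ^= hash_fn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  }
  return seed;
}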
paddle/fluid/operators/conv_fusion_op.cu.cc
...
@@ -30,6 +30,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
 using DataLayout = platform::DataLayout;
+using framework::AlgorithmsCache;
+
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
...
@@ -139,37 +141,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       }
       return fwd_perf_stat[0].algo;
     };
-    AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
+    AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
+        ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
     int search_times = ctx.Attr<int>("search_times");
     search_times = std::max(
         static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
+    // TODO(dangqingqing): Unify this if-else.
     if (search_times > 0) {
       // The searched algo will be cached by `search_times` times for
       // different input dimension. For other dimensions, select the algo
       // of closest area.
-      auto var_name = ctx.Inputs("AlgoCache")[0];
-      algo_cache =
-          ctx.scope()
-              .FindVar(var_name)
-              ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
-                                      search_func);
+      algo = algo_cache.GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
+                                     search_func);
     } else {
-      // Cache searched algo in Var(kCUDNNFwdAlgoCache).
-      // all conv ops use the same kCUDNNFwdAlgoCache variable.
-      if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
-        algo_cache =
-            ctx.scope()
-                .FindVar(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      } else {
-        // TODO(qingqing) remove const_cast
-        algo_cache =
-            const_cast<framework::Scope*>(ctx.scope().parent())
-                ->Var(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      }
-      algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings,
-                                      dilations, 0, search_func);
+      algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, paddings,
+                                     dilations, 0, search_func);
     }
     VLOG(3) << "choose algo " << algo;
...
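The `search_times` branch relies on the second, area-keyed `GetAlgorithm` overload shown in conv_cudnn_op_cache.h above: run the expensive search for at most `search_times` distinct input areas, then reuse a cached entry. A standalone sketch of that policy under illustrative names (note the original code, as reconstructed, falls back to the entry with the smallest area key, even though its comment says "closest area"):

#include <cstdint>
#include <functional>
#include <limits>
#include <unordered_map>

// Illustrative sketch, not the tree's API: area-keyed algorithm cache.
template <typename TAlgorithm>
class AreaAlgoCache {
 public:
  TAlgorithm Get(int64_t area, int search_times,
                 std::function<TAlgorithm()> search) {
    auto it = cache_.find(area);
    if (it != cache_.end()) return it->second;  // already searched this area
    if (searched_ < search_times) {
      TAlgorithm algo = search();  // expensive exhaustive search
      cache_[area] = algo;
      ++searched_;
      return algo;
    }
    // Search budget exhausted: reuse the entry with the smallest area key,
    // mirroring the min-key scan in the original AlgorithmsCache.
    int64_t min_key = std::numeric_limits<int64_t>::max();
    TAlgorithm algo{};
    for (const auto& kv : cache_) {
      if (kv.first < min_key) {
        min_key = kv.first;
        algo = kv.second;
      }
    }
    return algo;
  }

 private:
  std::unordered_map<int64_t, TAlgorithm> cache_;
  int searched_{0};
};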
paddle/fluid/operators/conv_op.cc
...
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>

 #ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
 #ifdef PADDLE_WITH_MKLDNN
...
@@ -109,8 +110,20 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
                    "float16 can only be used when CUDNN is used");
   }

-  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                 library, customized_type_value);
+  auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                      library, customized_type_value);
+#ifdef PADDLE_WITH_CUDA
+  std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
+  // TODO(dangqingqing): Currently conv_fusion_op use cudnn but sets use_cudnn
+  // to false. It should be fixed and then here should only create if library
+  // is kCUDNN.
+  if (configs.empty()) {
+    std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>> p(
+        new framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>());
+    configs.push_back(p);
+  }
+#endif
+  return type;
 }

 void Conv2DOpMaker::Make() {
...
@@ -410,9 +423,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
   }
 #endif

-  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                 ctx.GetPlace(), layout_, library_,
-                                 customized_type_value);
+  auto type = framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                      ctx.GetPlace(), layout_, library_,
+                                      customized_type_value);
+#ifdef PADDLE_WITH_CUDA
+  if (library_ == framework::LibraryType::kCUDNN) {
+    std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
+    if (configs.empty()) {
+      std::shared_ptr<
+          framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>
+          p(new framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>());
+      configs.push_back(p);
+      std::shared_ptr<
+          framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>
+          p2(new framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>());
+      configs.push_back(p2);
+    }
+  }
+#endif
+  return type;
 }

 class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
...
paddle/fluid/operators/detection/yolov3_loss_op.cc
...
@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
              "The ignore threshold to ignore confidence loss.")
         .SetDefault(0.7);
     AddComment(R"DOC(
-         This operator generate yolov3 loss by given predict result and ground
+         This operator generates yolov3 loss based on given predict result and ground
          truth boxes.

          The output of previous network is in shape [N, C, H, W], while H and W
-         should be the same, specify the grid size, each grid point predict given
-         number boxes, this given number is specified by anchors, it should be
-         half anchors length, which following will be represented as S. In the
-         second dimention(the channel dimention), C should be S * (class_num + 5),
-         class_num is the box categoriy number of source dataset(such as coco),
-         so in the second dimention, stores 4 box location coordinates x, y, w, h
-         and confidence score of the box and class one-hot key of each anchor box.
+         should be the same, H and W specify the grid size, each grid point predict
+         given number boxes, this given number, which following will be represented as S,
+         is specified by the number of anchors. In the second dimension(the channel
+         dimension), C should be equal to S * (class_num + 5), class_num is the object
+         category number of source dataset(such as 80 in coco dataset), so in the
+         second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
+         also includes confidence score of the box and class one-hot key of each anchor box.

-         While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions
-         correspnd to:
+         Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box
+         predictions should be as follows:

          $$
-         b_x = \sigma(t_x) + c_x
-         b_y = \sigma(t_y) + c_y
+         b_x = \\sigma(t_x) + c_x
+         $$
+         $$
+         b_y = \\sigma(t_y) + c_y
+         $$
+         $$
          b_w = p_w e^{t_w}
+         $$
+         $$
          b_h = p_h e^{t_h}
          $$

-         While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
-         is specified by anchors.
+         In the equation above, :math:`c_x, c_y` is the left top corner of current grid
+         and :math:`p_w, p_h` is specified by anchors.

          As for confidence score, it is the logistic regression value of IoU between
          anchor boxes and ground truth boxes, the score of the anchor box which has
-         the max IoU should be 1, and if the anchor box has IoU bigger then ignore
+         the max IoU should be 1, and if the anchor box has IoU bigger than ignore
          thresh, the confidence score loss of this anchor box will be ignored.

          Therefore, the yolov3 loss consist of three major parts, box location loss,
...
@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
          In order to trade off box coordinate losses between big boxes and small
          boxes, box coordinate losses will be mutiplied by scale weight, which is
-         calculated as follow.
+         calculated as follows.

          $$
          weight_{box} = 2.0 - t_w * t_h
          $$

-         Final loss will be represented as follow.
+         Final loss will be represented as follows.

          $$
          loss = (loss_{xy} + loss_{wh}) * weight_{box}
...
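The box decoding these equations describe is easy to state in code. A minimal, illustrative helper (not part of the operator itself):

#include <cmath>

// Decode raw predictions (t_x, t_y, t_w, t_h) into a box, given the grid
// cell's top-left corner (c_x, c_y) and the anchor size (p_w, p_h).
struct Box {
  float x, y, w, h;
};

inline float Sigmoid(float v) { return 1.0f / (1.0f + std::exp(-v)); }

Box DecodeYoloBox(float tx, float ty, float tw, float th,
                  float cx, float cy, float pw, float ph) {
  Box b;
  b.x = Sigmoid(tx) + cx;   // b_x = sigma(t_x) + c_x
  b.y = Sigmoid(ty) + cy;   // b_y = sigma(t_y) + c_y
  b.w = pw * std::exp(tw);  // b_w = p_w * e^{t_w}
  b.h = ph * std::exp(th);  // b_h = p_h * e^{t_h}
  return b;
}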
paddle/fluid/operators/jit/test.cc
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
*
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License");
*
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
*
You may obtain a copy of the License at
You may obtain a copy of the License at
*
*
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
*
*
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
*
distributed under the License is distributed on an "AS IS" BASIS,
distributed under the License is distributed on an "AS IS" BASIS,
*
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
*
limitations under the License. */
limitations under the License. */
#include <random>
#include <random>
#include <string>
#include <string>
...
@@ -259,7 +259,7 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
...
@@ -259,7 +259,7 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
const
std
::
vector
<
T
>&
x
,
const
std
::
vector
<
T
>&
yref
,
const
std
::
vector
<
T
>&
x
,
const
std
::
vector
<
T
>&
yref
,
const
typename
jit
::
SeqPoolTuples
<
T
>::
attr_type
&
attr
)
{
const
typename
jit
::
SeqPoolTuples
<
T
>::
attr_type
&
attr
)
{
EXPECT_TRUE
(
tgt
!=
nullptr
);
EXPECT_TRUE
(
tgt
!=
nullptr
);
EXPECT_EQ
(
x
.
size
()
%
yref
.
size
(),
0
);
EXPECT_EQ
(
x
.
size
()
%
yref
.
size
(),
static_cast
<
size_t
>
(
0
)
);
int
w
=
yref
.
size
();
int
w
=
yref
.
size
();
std
::
vector
<
T
>
y
(
w
);
std
::
vector
<
T
>
y
(
w
);
const
T
*
x_data
=
x
.
data
();
const
T
*
x_data
=
x
.
data
();
...
...
paddle/fluid/operators/lstm_op.h
...
@@ -151,9 +151,10 @@ class LSTMKernel : public framework::OpKernel<T> {
       lstm_value.output_value = out_t.data<T>();
       lstm_value.state_value = cell_t.data<T>();
       lstm_value.state_active_value = cell_pre_act_t.data<T>();
+      T cell_clip = 0.0;
       math::LstmUnitFunctor<DeviceContext, T>::compute(
-          device_ctx, lstm_value, frame_size, cur_batch_size, gate_act,
-          cell_act, cand_act);
+          device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip,
+          gate_act, cell_act, cand_act);
       lstm_value.prev_state_value = lstm_value.state_value;
     }
...
@@ -316,9 +317,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
       lstm_value.output_value = nullptr;
       lstm_grad.state_active_grad = nullptr;
       int cur_batch_size = bend - bstart;
+      T cell_clip = 0.0;
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
-          device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
-          gate_act, cell_act, cand_act);
+          device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
+          cell_clip, gate_act, cell_act, cand_act);
       if (n > 0) {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
...
paddle/fluid/operators/lstmp_op.cc
...
@@ -73,12 +73,6 @@ class LSTMPOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE(ctx->HasInput("C0"),
                      "Input(C0) of LSTMP operator should not be null after "
                      "Input(H0) provided.");
-      auto h_dims = ctx->GetInputDim("H0");
-      auto c_dims = ctx->GetInputDim("C0");
-      PADDLE_ENFORCE(h_dims == c_dims,
-                     "The dimension of Input(H0) and Input(C0) "
-                     "should be the same.");
-      ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
     }

     auto b_dims = ctx->GetInputDim("Bias");
...
@@ -180,11 +174,6 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
               "This LoDTensor is obtained in the forward and used in the "
               "backward.")
         .AsIntermediate();
-    AddOutput("OrderedP0",
-              "(Tensor) the projection of the initial hidden state "
-              "H0. This is a tensor with shape (N x P), where N is the "
-              "batch size and P is the hidden size.")
-        .AsIntermediate();
     AddAttr<bool>("use_peepholes",
                   "(bool, defalut: True) "
                   "whether to enable diagonal/peephole connections.")
...
@@ -193,6 +182,16 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, defalut: False) "
                   "whether to compute reversed LSTMP.")
         .SetDefault(false);
+    AddAttr<float>("cell_clip",
+                   "(float, defalut: 0.0) "
+                   "Clip for Tensor for cell state tensor when clip value is "
+                   "greater than 0.0")
+        .SetDefault(0.0);
+    AddAttr<float>("proj_clip",
+                   "(float, defalut: 0.0) "
+                   "Clip for Tensor for projection tensor when clip value is "
+                   "greater than 0.0")
+        .SetDefault(0.0);
     AddAttr<std::string>(
         "gate_activation",
         "(string, default: sigmoid)"
...
paddle/fluid/operators/lstmp_op.h
...
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
...
@@ -21,17 +22,50 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
+#include "paddle/fluid/platform/transform.h"

 namespace paddle {
 namespace operators {

 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
+using platform::Transform;

 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

+template <typename T>
+class _ClipFunctor {
+ public:
+  explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
+  HOSTDEVICE T operator()(const T& x) const {
+    if (x < min_)
+      return min_;
+    else if (x > max_)
+      return max_;
+    else
+      return x;
+  }
+
+ private:
+  T min_;
+  T max_;
+};
+
+template <typename T>
+class _ClipGradFunctor {
+ public:
+  explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
+  HOSTDEVICE T operator()(const T& x, const T& y) const {
+    return (y > min_ && y < max_) ? x : 0;
+  }
+
+ private:
+  T min_;
+  T max_;
+};
+
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const framework::Tensor& src,
...
@@ -67,9 +101,11 @@ class LSTMPKernel : public framework::OpKernel<T> {
     auto* bias = ctx.Input<Tensor>("Bias");

     auto* hidden_t0 = ctx.Input<Tensor>("H0");
-    auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
     auto* cell_t0 = ctx.Input<Tensor>("C0");

+    auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
+    auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
+
     auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
     batch_gate->mutable_data<T>(ctx.GetPlace());
     auto* proj_out = ctx.Output<LoDTensor>("Projection");
...
@@ -110,6 +146,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     }
     lstmp_value.prev_state_value = nullptr;
     Tensor ordered_c0;
+    Tensor ordered_h0;

     framework::Vector<size_t> order(batch_gate->lod()[2]);
...
@@ -169,18 +206,9 @@ class LSTMPKernel : public framework::OpKernel<T> {
         // Since the batch computing for LSTMP reorders the input sequence
         // according to their length. The initialized hidden state also needs
         // to reorder.
-        Tensor ordered_h0;
-        ordered_proj0->mutable_data<T>(ctx.GetPlace());
         ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
                                            &ordered_h0, true);
-        blas.MatMul(ordered_h0, false, *proj_weight, false,
-                    static_cast<T>(1.0), ordered_proj0, static_cast<T>(0.0));
-        if (proj_act != math::detail::ActivationType::kIdentity) {
-          auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
-          ActCompute(cell_act, place, proj0_dev, proj0_dev);
-        }
-        blas.MatMul(*ordered_proj0, false, *weight, false, static_cast<T>(1.0),
-                    &gate_t, static_cast<T>(1.0));
+        blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
+                    &gate_t, static_cast<T>(1.0));
       }
...
@@ -189,8 +217,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
       lstmp_value.state_value = cell_t.data<T>();
       lstmp_value.state_active_value = cell_pre_act_t.data<T>();
       math::LstmUnitFunctor<DeviceContext, T>::compute(
-          device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
-          cell_act, cand_act);
+          device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip,
+          gate_act, cell_act, cand_act);
       lstmp_value.prev_state_value = lstmp_value.state_value;
       blas.MatMul(hidden_t, false, *proj_weight, false, static_cast<T>(1.0),
                   &proj_t, static_cast<T>(0.0));
...
@@ -198,6 +226,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
         auto proj_t_dev = EigenMatrix<T>::From(proj_t);
         ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
       }
+      if (proj_clip && proj_clip > 0.0) {
+        T* x_data = proj_t.data<T>();
+        int64_t numel = proj_t.numel();
+        Transform<DeviceContext> trans;
+        trans(ctx.template device_context<DeviceContext>(), x_data,
+              x_data + numel, x_data,
+              _ClipFunctor<T>(-1.0 * proj_clip, proj_clip));
+      }
     }

     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
...
@@ -239,6 +275,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     auto* proj_out = ctx.Input<LoDTensor>("Projection");
     auto* cell_out = ctx.Input<LoDTensor>("Cell");

+    auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
+    auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
+
     auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
     auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
     auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
...
@@ -253,7 +292,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));

     auto* h0 = ctx.Input<Tensor>("H0");
-    auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
     auto* c0 = ctx.Input<Tensor>("C0");

     auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
...
@@ -363,6 +401,17 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       Tensor cur_proj = batch_proj.Slice(bstart, bend);
       Tensor proj_g = batch_proj_g.Slice(bstart, bend);
+      if (proj_clip && proj_clip > 0.0) {
+        T* dx_data = proj_g.data<T>();
+        T* x_data = cur_proj.data<T>();
+        int64_t numel = proj_g.numel();
+        Transform<DeviceContext> trans;
+        trans(ctx.template device_context<DeviceContext>(), dx_data,
+              dx_data + numel, x_data, dx_data,
+              _ClipGradFunctor<T>(-1.0 * proj_clip, proj_clip));
+      }
       if (proj_act != math::detail::ActivationType::kIdentity) {
         auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
         auto proj_g_dev = EigenMatrix<T>::From(proj_g);
...
@@ -412,7 +461,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
-          gate_act, cell_act, cand_act);
+          cell_clip, gate_act, cell_act, cand_act);

       if (n > 0) {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
...
@@ -431,32 +480,15 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
         ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
                                            &ordered_h0, true);
         if (weight_g) {
-          blas.MatMul(*ordered_proj0, true, gate_g, false,
-                      static_cast<T>(1.0), weight_g, static_cast<T>(1.0));
+          blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
+                      weight_g, static_cast<T>(1.0));
         }
       }
       if (h0 && (h0_g || proj_weight_g)) {
         ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
-        Tensor proj0_g;
-        proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
-        proj0_g.mutable_data<T>(ctx.GetPlace());
-        blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
-                    &proj0_g, static_cast<T>(0.0));
-        if (proj_act != math::detail::ActivationType::kIdentity) {
-          auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
-          auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
-          ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
-                         proj0_g_dev);
-        }
-        if (h0_g) {
-          blas.MatMul(proj0_g, false, *proj_weight, true, static_cast<T>(1.0),
-                      &ordered_h0_g, static_cast<T>(0.0));
-        }
-        if (proj_weight_g) {
-          blas.MatMul(ordered_h0, true, proj0_g, false, static_cast<T>(1.0),
-                      proj_weight_g, static_cast<T>(1.0));
-        }
+        blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
+                    &ordered_h0_g, static_cast<T>(0.0));
       }
     }
   }
...
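The new proj_clip path applies elementwise clipping through platform::Transform with the _ClipFunctor/_ClipGradFunctor pair added above. A host-side sketch of the same forward/backward behavior, using std::transform and plain vectors (illustrative, not the operator code):

#include <algorithm>
#include <vector>

// Forward: clamp each projection value into [-clip, clip],
// matching _ClipFunctor.
void ClipForward(std::vector<float>* x, float clip) {
  std::transform(x->begin(), x->end(), x->begin(), [clip](float v) {
    return std::min(std::max(v, -clip), clip);
  });
}

// Backward: matching _ClipGradFunctor, pass the gradient only where the
// (already clipped) forward value y lies strictly inside the range;
// saturated positions get zero gradient.
void ClipBackward(const std::vector<float>& y, std::vector<float>* dx,
                  float clip) {
  for (size_t i = 0; i < dx->size(); ++i) {
    if (!(y[i] > -clip && y[i] < clip)) (*dx)[i] = 0.0f;
  }
}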
paddle/fluid/operators/math/CMakeLists.txt
...
@@ -39,6 +39,7 @@ math_library(cross_entropy)
 math_library(cos_sim_functor)
 math_library(depthwise_conv DEPS cub)
 math_library(im2col)
+math_library(sample_prob)
 math_library(sampler)
 math_library(gru_compute DEPS activation_functions math_function)
...
paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
...
@@ -32,7 +32,8 @@ namespace detail {
 template <class T, class Op>
 void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                     int frame_size, ActivationType active_node,
+                                     int frame_size, T cell_clip,
+                                     ActivationType active_node,
                                      ActivationType active_gate,
                                      ActivationType active_state) {
   T r_value_in;
...
@@ -67,7 +68,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
     op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
        &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
     value_in[i] = r_value_in;
     value_ig[i] = r_value_ig;
...
@@ -82,7 +83,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                       LstmMetaGrad<T> grad, int frame_size,
-                                      ActivationType active_node,
+                                      T cell_clip, ActivationType active_node,
                                       ActivationType active_gate,
                                       ActivationType active_state) {
   T r_value_in;
...
@@ -135,7 +136,7 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
        &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
        &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
        &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
     grad_in[i] = r_grad_in;
     grad_ig[i] = r_grad_ig;
...
@@ -154,7 +155,8 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                   int frame_size, ActivationType active_node,
+                                   int frame_size, T cell_clip,
+                                   ActivationType active_node,
                                    ActivationType active_gate,
                                    ActivationType active_state) {
 #ifdef __AVX__
...
@@ -194,7 +196,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
     op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
        &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
     value_in[i] = r_value_in;
     value_ig[i] = r_value_ig;
...
@@ -210,7 +212,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                     LstmMetaGrad<T> grad, int frame_size,
-                                    ActivationType active_node,
+                                    T cell_clip, ActivationType active_node,
                                     ActivationType active_gate,
                                     ActivationType active_state) {
 #ifdef __AVX__
...
@@ -268,7 +270,7 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
        &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
        &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
        &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
     grad_in[i] = r_grad_in;
     grad_ig[i] = r_grad_ig;
...
@@ -292,27 +294,27 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
-                      ActivationType active_node, ActivationType active_gate,
-                      ActivationType active_state) {
+                      T cell_clip, ActivationType active_node,
+                      ActivationType active_gate, ActivationType active_state) {
   if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
-                                     active_gate, active_state);
+    avx_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
+                                     active_node, active_gate, active_state);
   } else {
-    naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
-                                       active_gate, active_state);
+    naive_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
+                                       active_node, active_gate, active_state);
   }
 }

 template <class T, class Op>
 void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frame_size, ActivationType active_node,
+                       int frame_size, T cell_clip, ActivationType active_node,
                        ActivationType active_gate,
                        ActivationType active_state) {
   if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
-                                      active_gate, active_state);
+    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
+                                      active_node, active_gate, active_state);
   } else {
-    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
-                                        active_node, active_gate, active_state);
+    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
+                                        active_node, active_gate, active_state);
   }
 }
...
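Every signature in this file now threads a cell_clip value down to the per-frame LSTM op, where the cell state is clamped into [-cell_clip, cell_clip] when cell_clip > 0 (see lstm_kernel.h below; cell_clip <= 0 disables clipping). A scalar sketch of that step, with illustrative names:

// Scalar sketch of the cell-state clipping these signatures thread
// through: after state = in * ig + prev_state * fg, clamp the state
// when a positive cell_clip is given.
template <typename T>
inline T ClipCellState(T state, T cell_clip) {
  if (cell_clip > static_cast<T>(0)) {
    if (state < -cell_clip) state = -cell_clip;
    if (state > cell_clip) state = cell_clip;
  }
  return state;
}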
paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
...
@@ -31,7 +31,8 @@ namespace detail {
...
@@ -31,7 +31,8 @@ namespace detail {
*/
*/
template
<
class
T
,
class
Op
,
bool
is_batch
>
template
<
class
T
,
class
Op
,
bool
is_batch
>
__global__
void
KeLstmForward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
__global__
void
KeLstmForward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
ActivationType
active_node
,
int
batch_size
,
T
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
ActivationType
active_state
)
{
const
int
frame_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
const
int
frame_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
...
@@ -72,7 +73,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
...
@@ -72,7 +73,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
op
(
&
r_value_in
,
&
r_value_ig
,
&
r_value_fg
,
&
r_value_og
,
&
r_prev_state
,
op
(
&
r_value_in
,
&
r_value_ig
,
&
r_value_fg
,
&
r_value_og
,
&
r_prev_state
,
&
r_state
,
&
r_state_atv
,
&
r_out
,
&
r_checkI
,
&
r_checkF
,
&
r_checkO
,
&
r_state
,
&
r_state_atv
,
&
r_out
,
&
r_checkI
,
&
r_checkF
,
&
r_checkO
,
active_node
,
active_gate
,
active_state
);
&
cell_clip
,
active_node
,
active_gate
,
active_state
);
value
.
gate_value
[
frame_idx
]
=
r_value_in
;
value
.
gate_value
[
frame_idx
]
=
r_value_in
;
value
.
gate_value
[
frame_idx
+
frame_size
]
=
r_value_ig
;
value
.
gate_value
[
frame_idx
+
frame_size
]
=
r_value_ig
;
...
@@ -91,7 +92,8 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
...
@@ -91,7 +92,8 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
template
<
class
T
,
class
Op
,
bool
is_batch
>
template
<
class
T
,
class
Op
,
bool
is_batch
>
__global__
void
KeLstmBackward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
__global__
void
KeLstmBackward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
int
batch_size
,
ActivationType
active_node
,
int
batch_size
,
T
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
ActivationType
active_state
)
{
const
int
frame_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
const
int
frame_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
...
@@ -148,8 +150,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
...
@@ -148,8 +150,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
op
(
&
r_value_in
,
&
r_value_ig
,
&
r_value_fg
,
&
r_value_og
,
&
r_grad_in
,
&
r_grad_ig
,
op
(
&
r_value_in
,
&
r_value_ig
,
&
r_value_fg
,
&
r_value_og
,
&
r_grad_in
,
&
r_grad_ig
,
&
r_grad_fg
,
&
r_grad_og
,
&
r_prev_state
,
&
r_prev_state_grad
,
&
r_state
,
&
r_grad_fg
,
&
r_grad_og
,
&
r_prev_state
,
&
r_prev_state_grad
,
&
r_state
,
&
r_state_grad
,
&
r_state_atv
,
&
r_output_grad
,
&
r_checkI
,
&
r_checkF
,
&
r_state_grad
,
&
r_state_atv
,
&
r_output_grad
,
&
r_checkI
,
&
r_checkF
,
&
r_checkO
,
&
r_checkIGrad
,
&
r_checkFGrad
,
&
r_checkOGrad
,
active_node
,
&
r_checkO
,
&
r_checkIGrad
,
&
r_checkFGrad
,
&
r_checkOGrad
,
&
cell_clip
,
active_gate
,
active_state
);
active_
node
,
active_
gate
,
active_state
);
grad
.
gate_grad
[
frame_idx
]
=
r_grad_in
;
grad
.
gate_grad
[
frame_idx
]
=
r_grad_in
;
grad
.
gate_grad
[
frame_idx
+
frame_size
]
=
r_grad_ig
;
grad
.
gate_grad
[
frame_idx
+
frame_size
]
=
r_grad_ig
;
...
@@ -185,8 +187,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
...
@@ -185,8 +187,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
template
<
class
T
,
class
Op
>
template
<
class
T
,
class
Op
>
void
gpu_lstm_forward
(
const
platform
::
DeviceContext
&
context
,
Op
op
,
void
gpu_lstm_forward
(
const
platform
::
DeviceContext
&
context
,
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
ActivationType
active_node
,
ActivationType
active_gat
e
,
T
cell_clip
,
ActivationType
active_nod
e
,
ActivationType
active_state
)
{
ActivationType
active_
gate
,
ActivationType
active_
state
)
{
dim3
threads
;
dim3
threads
;
dim3
grid
;
dim3
grid
;
if
(
batch_size
==
1
)
{
if
(
batch_size
==
1
)
{
...
@@ -205,12 +207,12 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
   if (batch_size == 1) {
     KeLstmForward<T, Op,
                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, frame_size, batch_size, active_node, active_gate,
-        active_state);
+        op, value, frame_size, batch_size, cell_clip, active_node,
+        active_gate, active_state);
   } else {
     KeLstmForward<T, Op,
                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, frame_size, batch_size, active_node, active_gate,
-        active_state);
+        op, value, frame_size, batch_size, cell_clip, active_node,
+        active_gate, active_state);
   }
 }
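Both launches above bind the batch layout at compile time: `is_batch` is a non-type template parameter, so each specialization is compiled without the runtime branch. A minimal CUDA sketch of the same dispatch pattern, using hypothetical names (KeDemo, launch_demo) rather than anything from the Paddle sources:

#include <cuda_runtime.h>

// The boolean template parameter selects the indexing scheme at compile
// time; the ternary on is_batch below is resolved by the compiler.
template <bool is_batch>
__global__ void KeDemo(float* data, int frame_size, int batch_size) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int n = is_batch ? frame_size * batch_size : frame_size;
  if (idx < n) data[idx] *= 2.0f;
}

// Host-side dispatch mirroring gpu_lstm_forward: pick the specialization
// from the runtime batch size.
void launch_demo(float* data, int frame_size, int batch_size,
                 cudaStream_t stream) {
  const int threads = 256;
  const int grid = (frame_size * batch_size + threads - 1) / threads;
  if (batch_size == 1) {
    KeDemo</* is_batch= */ false><<<grid, threads, 0, stream>>>(
        data, frame_size, batch_size);
  } else {
    KeDemo</* is_batch= */ true><<<grid, threads, 0, stream>>>(
        data, frame_size, batch_size);
  }
}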
...
@@ -218,7 +220,7 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
 template <class T, class Op>
 void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
                        LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frame_size, int batch_size,
+                       int frame_size, int batch_size, T cell_clip,
                        ActivationType active_node, ActivationType active_gate,
                        ActivationType active_state) {
   dim3 threads;
...
@@ -239,13 +241,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
   if (batch_size == 1) {
     KeLstmBackward<T, Op,
                    /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, grad, frame_size, batch_size, active_node, active_gate,
-        active_state);
+        op, value, grad, frame_size, batch_size, cell_clip, active_node,
+        active_gate, active_state);
   } else {
     KeLstmBackward<T, Op,
                    /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, grad, frame_size, batch_size, active_node, active_gate,
-        active_state);
+        op, value, grad, frame_size, batch_size, cell_clip, active_node,
+        active_gate, active_state);
   }
 }
...
paddle/fluid/operators/math/detail/lstm_kernel.h
...
@@ -29,7 +29,7 @@ class lstm {
  public:
   HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
                              T *prev_state, T *state, T *state_atv, T *output,
-                             T *checkI, T *checkF, T *checkO,
+                             T *checkI, T *checkF, T *checkO, T *cell_clip,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
...
@@ -37,6 +37,15 @@ class lstm {
     *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
     *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
     *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
+    if (*cell_clip > 0.0) {
+      if (*state < -1.0 * (*cell_clip)) {
+        *state = -1.0 * (*cell_clip);
+      }
+      if (*state > *cell_clip) {
+        *state = *cell_clip;
+      }
+    }
     *value_og = activation(*value_og + (*state) * (*checkO), active_gate);
     *state_atv = activation(*state, active_state);
     *output = (*value_og) * (*state_atv);
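The inserted branch clamps the freshly computed cell state to [-cell_clip, cell_clip] before the output gate reads it, and leaves it untouched when the clip is not positive. The same logic for one scalar state, as a standalone sketch (names are illustrative, not from the Paddle sources):

#include <algorithm>

// Clamp an LSTM cell state to [-cell_clip, cell_clip]; a non-positive
// cell_clip disables the clamp, matching the `if (*cell_clip > 0.0)` guard
// in the kernel above.
template <typename T>
T clip_cell_state(T state, T cell_clip) {
  if (cell_clip > T(0)) {
    state = std::max(std::min(state, cell_clip), -cell_clip);
  }
  return state;
}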
...
@@ -52,7 +61,7 @@ class lstm {
                              __m256 *value_fg, __m256 *value_og,
                              __m256 *prev_state, __m256 *state,
                              __m256 *state_atv, __m256 *output, __m256 *checkI,
-                             __m256 *checkF, __m256 *checkO,
+                             __m256 *checkF, __m256 *checkO, T *cell_clip,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
...
@@ -65,6 +74,13 @@ class lstm {
                            active_gate);
     *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
                            _mm256_mul_ps(*prev_state, *value_fg));
+    if (*cell_clip > 0.0f) {
+      __m256 min = _mm256_set1_ps(0.0f - *cell_clip);
+      __m256 max = _mm256_set1_ps(*cell_clip);
+      *state = _mm256_min_ps(max, *state);
+      *state = _mm256_max_ps(min, *state);
+    }
     *value_og = activation(
         _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
     *state_atv = activation(*state, active_state);
...
@@ -86,15 +102,26 @@ class lstm {
                              T *prev_state, T *prev_state_grad, T *state,
                              T *state_grad, T *state_atv, T *output_grad,
                              T *checkI, T *checkF, T *checkO, T *checkIGrad,
-                             T *checkFGrad, T *checkOGrad,
+                             T *checkFGrad, T *checkOGrad, T *cell_clip,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
     *grad_og =
         activation((*output_grad) * (*state_atv), *value_og, active_gate);
-    *state_grad +=
-        activation((*output_grad) * (*value_og), *state_atv, active_state) +
-        (*grad_og) * (*checkO);
+    if (*cell_clip > 0.0f) {
+      if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) {
+        *state_grad = 0.0f;
+      } else {
+        *state_grad +=
+            activation((*output_grad) * (*value_og), *state_atv, active_state) +
+            (*grad_og) * (*checkO);
+      }
+    } else {
+      *state_grad +=
+          activation((*output_grad) * (*value_og), *state_atv, active_state) +
+          (*grad_og) * (*checkO);
+    }
     *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
     *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
     *grad_fg =
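The backward change mirrors the clamp: wherever the forward pass saturated the state at the clip boundary, the clipped value is locally constant in the pre-clip state, so the incoming state gradient is zeroed; elsewhere the usual accumulation runs. For the clamped state c' = clamp(c, -k, k) with k = cell_clip > 0:

\frac{\partial c'}{\partial c} =
\begin{cases}
1, & -k < c < k, \\
0, & c \le -k \ \text{or} \ c \ge k.
\end{cases}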
...
@@ -117,15 +144,24 @@ class lstm {
                              __m256 *prev_state, __m256 *prev_state_grad,
                              __m256 *state, __m256 *state_grad,
                              __m256 *state_atv, __m256 *output_grad,
                              __m256 *checkI, __m256 *checkF, __m256 *checkO,
                              __m256 *checkIGrad, __m256 *checkFGrad,
-                             __m256 *checkOGrad, ActivationType active_node,
+                             __m256 *checkOGrad, T *cell_clip,
+                             ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
     *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og,
                           active_gate);
-    *state_grad =
-        _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
-                                 *state_atv, active_state),
-                      *state_grad);
-    *state_grad =
-        _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
+    if (*cell_clip > 0.0f) {
+      T *state_ = reinterpret_cast<T *>(state);
+      if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) {
+        *state_grad = _mm256_set1_ps(0.0f);
+      } else {
+        *state_grad =
+            _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
+                                     *state_atv, active_state),
+                          *state_grad);
+        *state_grad =
+            _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
+      }
+    } else {
+      *state_grad =
+          _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
+                                   *state_atv, active_state),
+                        *state_grad);
+      *state_grad =
+          _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
+    }
     *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in,
                           active_node);
     *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig,
...
paddle/fluid/operators/math/lstm_compute.cc
...
@@ -24,12 +24,12 @@ template <class T>
 struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext& context,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      const detail::ActivationType& gate_act,
+                      T cell_clip, const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     for (int b = 0; b < batch_size; b++) {
       detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
-                               cand_act, gate_act, cell_act);
+                               cell_clip, cand_act, gate_act, cell_act);
       value.gate_value += frame_size * 4;
       value.state_value += frame_size;
       value.state_active_value += frame_size;
...
@@ -45,13 +45,14 @@ template <class T>
 struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext& context,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                       int frame_size, int batch_size,
+                      T cell_clip,
                       const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     for (int b = 0; b < batch_size; b++) {
       detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
-                                frame_size, cand_act, gate_act, cell_act);
+                                frame_size, cell_clip, cand_act, gate_act,
+                                cell_act);
       value.gate_value += frame_size * 4;
       value.state_value += frame_size;
...
paddle/fluid/operators/math/lstm_compute.cu
...
@@ -24,12 +24,12 @@ template <class T>
 struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext& context,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      const detail::ActivationType& gate_act,
+                      T cell_clip, const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
-                                frame_size, batch_size, cand_act, gate_act,
-                                cell_act);
+                                frame_size, batch_size, cell_clip, cand_act,
+                                gate_act, cell_act);
   }
 };
...
@@ -37,13 +37,13 @@ template <class T>
 struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext& context,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                       int frame_size, int batch_size,
+                      T cell_clip,
                       const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value,
-                              grad, frame_size, batch_size, cand_act,
-                              gate_act, cell_act);
+                              grad, frame_size, batch_size, cell_clip,
+                              cand_act, gate_act, cell_act);
   }
 };
...
paddle/fluid/operators/math/lstm_compute.h
...
@@ -50,7 +50,7 @@ template <typename DeviceContext, typename T>
 class LstmUnitFunctor {
  public:
   static void compute(const DeviceContext& context, LstmMetaValue<T> value,
-                      int frame_size, int batch_size,
+                      int frame_size, int batch_size, T cell_clip,
                       const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act);
...
@@ -61,7 +61,7 @@ class LstmUnitGradFunctor {
  public:
   static void compute(const DeviceContext& context, LstmMetaValue<T> value,
                       LstmMetaGrad<T> grad, int frame_size, int batch_size,
-                      const detail::ActivationType& gate_act,
+                      T cell_clip, const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act);
 };
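A sketch of a call site for the widened declaration; the filled-in LstmMetaValue and device context are assumed rather than shown, and the ActivationType values (kSigmoid, kTanh) are assumed to come from detail/activation_functions.h:

// Hypothetical call site: `value` is assumed to be fully initialized.
void run_lstm_step(const paddle::platform::CPUDeviceContext& ctx,
                   paddle::operators::math::LstmMetaValue<float> value,
                   int frame_size, int batch_size) {
  using paddle::operators::math::LstmUnitFunctor;
  namespace detail = paddle::operators::math::detail;
  const float cell_clip = 3.0f;  // 0.0f disables clipping
  LstmUnitFunctor<paddle::platform::CPUDeviceContext, float>::compute(
      ctx, value, frame_size, batch_size, cell_clip,
      detail::ActivationType::kSigmoid,  // gate_act
      detail::ActivationType::kTanh,     // cell_act
      detail::ActivationType::kTanh);    // cand_act
}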
...
paddle/fluid/operators/ngraph/ngraph_ops.h → paddle/fluid/operators/math/sample_prob.cc
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
     http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
...
@@ -12,28 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-/*
- * This file contains the list of the ngraph operators for Paddle.
- *
- * ATTENTION: It requires some C++11 features, for lower version C++ or C, we
- * might release another API.
- */
-
-#pragma once
-
-#include "ops/accuracy_op.h"
-#include "ops/activation_op.h"
-#include "ops/batch_norm_op.h"
-#include "ops/binary_unary_op.h"
-#include "ops/conv2d_op.h"
-#include "ops/cross_entropy_op.h"
-#include "ops/elementwise_add_op.h"
-#include "ops/fill_constant_op.h"
-#include "ops/mean_op.h"
-#include "ops/momentum_op.h"
-#include "ops/mul_op.h"
-#include "ops/pool2d_op.h"
-#include "ops/scale_op.h"
-#include "ops/softmax_op.h"
-#include "ops/sum_op.h"
-#include "ops/top_k_op.h"
+#include "paddle/fluid/operators/math/sample_prob.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template class SampleWithProb<platform::CPUDeviceContext, float>;
+template class SampleWithProb<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
paddle/fluid/operators/math/sample_prob.cu
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <thrust/random.h>
#include <thrust/sort.h>
#include <iostream>
#include <vector>

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/sampler.h"

namespace paddle {
namespace operators {
namespace math {

using Tensor = framework::Tensor;

template <typename T>
__device__ T gpu_adjust_prob(const T prob, const int num_samples,
                             const int num_tries) {
  if (num_samples == num_tries) {
    return prob * num_samples;
  } else {
    return -expm1(num_tries * log1p(-prob));
  }
}

class GPULogUniformSampler {
 public:
  __device__ int64_t Sample(float random, const int range,
                            const float log_range) const;
  __device__ float Probability(int64_t value, const float log_range) const;
};

__device__ int64_t GPULogUniformSampler::Sample(float random, const int range,
                                                const float log_range) const {
  // Got Log Uniform distribution from uniform distribution by
  // inverse_transform_sampling method
  const int64_t value = static_cast<int64_t>(exp(random * log_range)) - 1;
  // Mathematically, value should be <= range_, but might not be due to some
  // floating point roundoff, so we mod by range_.
  return value % range;
}

__device__ float GPULogUniformSampler::Probability(
    int64_t value, const float log_range) const {
  // Given f(x) = 1/[(x+1) * log_range_]
  // The value's probability is integral of f(x) from value to (value + 1)
  return (log((value + 2.0) / (value + 1.0))) / log_range;
}

template <typename T>
__global__ void SamplingCondidate(
    const size_t n, const int num_tries, const int range,
    const float log_range, const int num_true, const std::size_t num_samples,
    const int64_t* label_data, int64_t* samples_data, T* probabilities_data) {
  const int num_sampled_classes = num_true + num_samples;

  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int step_size = 0;
  GPULogUniformSampler sampler;

  for (; idx < n; idx += blockDim.x * gridDim.x) {
    int col_idx = idx % num_sampled_classes;
    int row_idx = idx / num_sampled_classes;
    if (col_idx < num_true) {
      samples_data[idx] = label_data[row_idx * num_true + col_idx];
    } else {
      samples_data[idx] = samples_data[col_idx];
    }
    probabilities_data[idx] = sampler.Probability(samples_data[idx], log_range);
    probabilities_data[idx] =
        gpu_adjust_prob(probabilities_data[idx], num_samples, num_tries);
  }
}

template <typename T>
int UniqSampler(const Sampler& sampler, const std::size_t num_samples,
                int64_t* samples_data) {
  // sample num_samles unique samples for an example, note that they are not
  // all negative samples
  std::unordered_set<int64_t> tmp_samples;
  tmp_samples.clear();
  int num_tries = 0;
  int j = 0;
  while (j < num_samples) {
    ++num_tries;
    auto v = sampler.Sample();
    auto insert_ok = tmp_samples.insert(v).second;
    if (!insert_ok) {
      continue;
    }
    samples_data[j] = v;
    ++j;
  }
  return num_tries;
}

template <typename T>
void GPUSampleWithProb<T>::operator()(
    const platform::CUDADeviceContext& context, const int seed,
    const int dict_size, const bool uniq, const std::size_t num_samples,
    const Tensor* L, Tensor* S, Tensor* P) {
  // UNDERSTAND: dimension issues
  const auto lbl_dim = L->dims();
  const int batch_size = lbl_dim[0];
  const int num_true = lbl_dim[1];
  const int num_sampled_classes = num_true + num_samples;
  framework::DDim ret_dim{batch_size, num_sampled_classes};

  // UNDERSTAND: raw data view
  const int64_t* label_data = L->data<int64_t>();
  int64_t* samples_data = S->data<int64_t>();
  T* probabilities_data = P->data<T>();

  int s_size = num_samples;
  framework::DDim s_dim{s_size};
  Tensor s;
  int64_t* s_data = s.mutable_data<int64_t>(s_dim, platform::CPUPlace());

  math::LogUniformSampler sampler(dict_size, seed);

  int range = dict_size;
  float log_range = log(range + 1);

  int num_tries = UniqSampler<T>(sampler, num_samples, s_data);
  VLOG(1) << "num_tries: " << num_tries;
  PADDLE_ENFORCE(cudaMemcpy(samples_data + num_true, s_data,
                            sizeof(int64_t) * num_samples,
                            cudaMemcpyHostToDevice));

  int threads = 512;
  const size_t size = batch_size * num_sampled_classes;
  int grid = (batch_size * num_sampled_classes + threads - 1) / threads;
  SamplingCondidate<T><<<grid, threads, 0, context.stream()>>>(
      size, num_tries, range, log_range, num_true, num_samples, label_data,
      samples_data, probabilities_data);
}

template class GPUSampleWithProb<float>;
template class GPUSampleWithProb<double>;
}  // namespace math
}  // namespace operators
}  // namespace paddle
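Two formulas drive this file. Probability integrates the log-uniform density over the unit interval starting at class id v, and gpu_adjust_prob converts a single-draw probability q into the probability that the class shows up at least once in t = num_tries draws, in the numerically stable expm1/log1p form:

P(v) = \int_{v}^{v+1} \frac{dx}{(x+1)\ln(R+1)}
     = \frac{\ln\big((v+2)/(v+1)\big)}{\ln(R+1)},
\qquad
q'(y) = 1 - (1-q)^{t} = -\operatorname{expm1}\big(t \cdot \operatorname{log1p}(-q)\big),

where R is dict_size; when every draw was unique (t equals num_samples) the code returns q * num_samples instead.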
paddle/fluid/operators/math/sample_prob.h
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <iostream>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/sampler.h"

namespace paddle {
namespace operators {
namespace math {

using Tensor = framework::Tensor;

/* UNDERSTAND: utility function to adjust probability for unique sampling,
return whatever as it is if not using unique samping */
template <typename T>
static T adjust_prob(const T prob, const int num_samples, const int num_tries) {
  if (num_samples == num_tries) {
    return prob * num_samples;
  } else {
    return -expm1(num_tries * log1p(-prob));
  }
}

template <typename DeviceContext, typename T>
class SampleWithProb {
 public:
  void operator()(const DeviceContext& context, const Sampler& sampler,
                  const std::size_t num_samples, const Tensor* L, Tensor* S,
                  Tensor* P) {
    // UNDERSTAND: dimension issues
    const auto lbl_dim = L->dims();
    const int batch_size = lbl_dim[0];
    const int num_true = lbl_dim[1];
    const int num_sampled_classes = num_true + num_samples;
    framework::DDim ret_dim{batch_size, num_sampled_classes};

    // UNDERSTAND: raw data view
    const int64_t* label_data = L->data<int64_t>();
    int64_t* samples_data =
        S->mutable_data<int64_t>(ret_dim, context.GetPlace());
    T* probabilities_data = P->mutable_data<T>(ret_dim, context.GetPlace());

    // temp sets for unique sampling
    std::unordered_set<int64_t> tmp_samples;
    int j = 0;  // column index
    // add true labels, not that efficient
    while (j < num_true) {
      for (int i = 0; i < batch_size; ++i) {
        auto samples_index = i * num_sampled_classes + j;
        auto v = label_data[i * num_true + j];
        samples_data[samples_index] = v;
        probabilities_data[samples_index] = sampler.Probability(v);
      }
      ++j;
    }

    // sample num_samles unique samples for an example, note that they are not
    // all negative samples
    tmp_samples.clear();
    int num_tries = 0;
    while (j < num_sampled_classes) {
      ++num_tries;
      auto v = sampler.Sample();
      auto insert_ok = tmp_samples.insert(v).second;
      if (!insert_ok) {
        continue;
      }
      auto p = sampler.Probability(v);
      for (int i = 0; i < batch_size; ++i) {
        auto samples_index = i * num_sampled_classes + j;
        samples_data[samples_index] = v;
        probabilities_data[samples_index] = p;
      }
      ++j;
    }

    // compute Q(y|x), because of unique sampling, probabilities need to be
    // adjusted
    for (int k = 0; k < num_sampled_classes; ++k) {
      for (int i = 0; i < batch_size; ++i) {
        auto samples_index = i * num_sampled_classes + k;
        probabilities_data[samples_index] = adjust_prob(
            probabilities_data[samples_index], num_samples, num_tries);
      }
    }
  }
};

#ifdef PADDLE_WITH_CUDA
template <typename T>
class GPUSampleWithProb {
 public:
  void operator()(const platform::CUDADeviceContext& context, const int seed,
                  const int dict_size, const bool uniq,
                  const std::size_t num_samples, const Tensor* L, Tensor* S,
                  Tensor* P);
};
#endif
}  // namespace math
}  // namespace operators
}  // namespace paddle
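A hedged sketch of driving the CPU path above. The LogUniformSampler constructor arguments (dict_size, seed) follow its use in sample_prob.cu; L is [batch_size, num_true] int64 labels, while S and P receive [batch_size, num_true + num_samples] outputs that operator() allocates itself via mutable_data:

// Hypothetical call site for SampleWithProb.
void sample_candidates(const paddle::platform::CPUDeviceContext& ctx,
                       const paddle::framework::Tensor& labels,
                       paddle::framework::Tensor* samples,
                       paddle::framework::Tensor* probs) {
  const int dict_size = 10000;  // vocabulary size (assumed)
  const int seed = 0;
  const std::size_t num_samples = 25;
  paddle::operators::math::LogUniformSampler sampler(dict_size, seed);
  paddle::operators::math::SampleWithProb<paddle::platform::CPUDeviceContext,
                                          float> sample_with_prob;
  sample_with_prob(ctx, sampler, num_samples, &labels, samples, probs);
}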
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
...
@@ -52,11 +52,6 @@ class MKLDNNActivationKernel
                    "Wrong layout/format set for Input x tensor");
 
     Functor functor;
-
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
     functor(ctx);
   }
 };
...
@@ -76,11 +71,6 @@ class MKLDNNActivationGradKernel
                    "is_test attribute should be set to False in training phase.");
 
     Functor functor;
-
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
     functor(ctx);
   }
 };
...
@@ -235,7 +225,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
         std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
     PADDLE_ENFORCE(src_memory != nullptr,
                    "Fail to find src_memory in device context");
-    src_memory->set_data_handle(*p_src_data.get());
+    src_memory->set_data_handle(*p_src_data);
 
     std::shared_ptr<memory> diff_src_memory;
...
paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
...
@@ -96,12 +96,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
-    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
-                       input->format() != memory::format::format_undef,
-                   "Wrong layout/format set for Input tensor");
-    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
-                       filter->format() != memory::format::format_undef,
-                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN);
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN);
     PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
                    "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
     PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
...
@@ -148,14 +144,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<primitive> pipeline;
 
-    auto src_format = input->format();
-    mkldnn::memory::format weights_format =
-        GetWeightsFormat(filter->format(), g, is_conv3d);
-
-    auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
-    auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
+    // For convolution with groups we need to recreate primitive descriptor
+    // as Paddle tensor is not having group dims while mkldnn treats
+    // group as another dimensions
+    mkldnn::memory::primitive_desc user_weights_mpd =
+        filter->get_mkldnn_prim_desc();
+    if (g > 1) {
+      mkldnn::memory::format weights_format =
+          GetWeightsFormat(filter->format(), g, is_conv3d);
+      auto user_weights_md = platform::MKLDNNMemDesc(
+          {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
+      user_weights_mpd =
+          mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine);
+    }
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
...
@@ -165,7 +166,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
-    weights_format = mkldnn::memory::format::any;
+    mkldnn::memory::format weights_format = mkldnn::memory::format::any;
     // Check the format for user's special output
     if (chosen_memory_format != mkldnn::memory::format::any) {
       if (is_conv3d) {
...
@@ -205,10 +206,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
     // create mkldnn memory from input tensors (data/weights)
     auto user_src_memory_p =
-        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
+        handler.AcquireSrcMemory(input->get_mkldnn_prim_desc(),
+                                 to_void_cast<T>(input_data));
     auto user_weights_memory_p = handler.AcquireWeightsMemory(
-        user_weights_md, to_void_cast<T>(filter_data));
+        user_weights_mpd, to_void_cast<T>(filter_data));
 
     // create reorder primitive if the input format is not the preferred one
     auto src_memory_p =
...
@@ -281,8 +282,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     pipeline.push_back(*conv_p);
     stream(stream::kind::eager).submit(pipeline).wait();
 
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(*dst_memory_p));
+    auto dst_mpd = dst_memory_p->get_primitive_desc();
+    output->set_mkldnn_prim_desc(dst_mpd);
   }
 
   void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
     const bool is_test = ctx.Attr<bool>("is_test");
...
@@ -947,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       // push primitive to stream and wait until it's executed
       pipeline.push_back(*conv_bwd_weights_p);
 
-      filter_grad->set_layout(DataLayout::kMKLDNN);
-      filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
+      auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc();
+      filter_grad->set_mkldnn_prim_desc(filter_grad_mpd);
     }
     if (input_grad) {
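The comment in the hunk above carries the key fact: the framework stores grouped-convolution weights without an explicit group dimension, while MKL-DNN's goihw layout expects the group count in front. The dimension reshuffle, reduced to plain vectors (illustrative only; the real adjustment happens where weights_tz is built):

#include <vector>

// Turn logical filter dims [O, I, H, W] into grouped dims [g, O/g, I, H, W],
// the shape MKL-DNN's goihw format expects. Assumes O is divisible by g.
std::vector<int> to_grouped_dims(const std::vector<int>& oihw, int g) {
  return {g, oihw[0] / g, oihw[1], oihw[2], oihw[3]};
}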
...
paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
...
@@ -42,8 +42,12 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
     // The format of output is set as the mkldnn's format
     // TODO(@mozga-intel) The format of matrix sets inside the another layers.
-    tensor->set_layout(DataLayout::kMKLDNN);
-    tensor->set_format(mkldnn::memory::format::oihw);
+    // TODO(jczaja): Remove this hack after checking performance on block layout
+    auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims(
+        paddle::framework::vectorize2int(tensor->dims()),
+        mkldnn::memory::format::oihw);
+    tensor->set_mkldnn_prim_desc(tensor_mem_pd);
   }
 };
 }  // namespace operators
...
paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
...
@@ -198,7 +198,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     }
 
     // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{*(pool_p.get())};
+    std::vector<mkldnn::primitive> pipeline{*pool_p};
     stream(stream::kind::eager).submit(pipeline).wait();
 
     output->set_layout(DataLayout::kMKLDNN);
...
@@ -367,8 +367,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory);
 
         pool_bwd_p = std::make_shared<pooling_backward>(
-            pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory,
-            *(diff_src_memory));
+            pool_bwd_pd, *diff_dst_memory, *workspace_memory,
+            *diff_src_memory);
         dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p);
       } else {
...
@@ -404,7 +403,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     if (is_diff_dst_reordered) {
       pipeline.push_back(reorder_diff_dst);
     }
-    pipeline.push_back(*(pool_bwd_p.get()));
+    pipeline.push_back(*pool_bwd_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 
     in_x_grad->set_layout(DataLayout::kMKLDNN);
...
paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
...
@@ -66,8 +66,7 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
                    "Fail to find softmax primitive in device context");
     if (softmax_p == nullptr) {
       softmax_p = std::make_shared<mkldnn::softmax_forward>(
-          *(softmax_pd_.get()),
+          *softmax_pd_,
           *(static_cast<mkldnn::memory*>(src_memory_p.get())),
           *(static_cast<mkldnn::memory*>(dst_memory_p.get())));
       dev_ctx_.SetBlob(prim_key, softmax_p);
     } else {
...
@@ -88,8 +87,8 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
                    "Fail to find softmax backward primitive in device context");
     if (softmax_bwd_p == nullptr) {
       softmax_bwd_p = std::make_shared<mkldnn::softmax_backward>(
-          *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()),
-          *(diff_src_memory_p.get()));
+          *softmax_bwd_pd_, *dst_memory_p, *diff_dst_memory_p,
+          *diff_src_memory_p);
       dev_ctx_.SetBlob(prim_key, softmax_bwd_p);
     } else {
       is_reusing_ = true;
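Several hunks in these MKL-DNN files make the same mechanical cleanup: `*(p.get())` becomes `*p`. For a non-null std::shared_ptr the two are identical, since operator* returns *get(); a minimal illustration:

#include <memory>

int main() {
  auto p = std::make_shared<int>(42);
  int a = *(p.get());  // old style
  int b = *p;          // equivalent, and what the diff switches to
  return a == b ? 0 : 1;
}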
...
paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
...
@@ -160,7 +160,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto get_selected_row = [&](size_t i) -> const SelectedRows& {
       if (i == 0 && in0) {
-        return *in0.get();
+        return *in0;
       } else {
         return in_vars[i]->Get<SelectedRows>();
       }
...
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
...
@@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                              mkldnn_engine, key);
 
     auto transpose_src_memory_p = handler.AcquireSrcMemory(
-        input->format(), platform::to_void_cast<T>(input_data));
+        input->get_mkldnn_prim_desc(), platform::to_void_cast<T>(input_data));
     auto transpose_dst_memory_p =
         handler.AcquireDstMemory(output, ctx.GetPlace());
     auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
...
@@ -61,6 +61,15 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<mkldnn::primitive> pipeline;
     pipeline.push_back(*transpose_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+
+    // Transpose did change logical dimensions of Tensor, but reorder does not.
+    // Reorder does change only physical layout eg. format , strides
+    // so we need to create new primitive descriptor with changed logical layout
+    // so it match output shape
+    auto output_mem_pd = paddle::platform::create_prim_desc_from_dims(
+        paddle::framework::vectorize2int(output->dims()),
+        mkldnn::memory::format::blocked);
+    output->set_mkldnn_prim_desc(output_mem_pd);
   }
 };
...
@@ -102,8 +111,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx,
                                              mkldnn_engine, key);
 
-    auto transpose_src_memory_p = handler.AcquireSrcMemory(
-        out_grad->format(), platform::to_void_cast<T>(out_grad_data));
+    auto transpose_src_memory_p =
+        handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(),
+                                 platform::to_void_cast<T>(out_grad_data));
     auto transpose_dst_memory_p =
         handler.AcquireDstMemory(x_grad, ctx.GetPlace());
     auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
...
@@ -112,6 +122,15 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<mkldnn::primitive> pipeline;
     pipeline.push_back(*transpose_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+
+    // Transpose did change logical dimensions of Tensor, but reorder does not.
+    // Reorder does change only physical layout eg. format , strides
+    // so we need to create new primitive descriptor with changed logical layout
+    // so it match output shape
+    auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims(
+        paddle::framework::vectorize2int(x_grad->dims()),
+        mkldnn::memory::format::blocked);
+    x_grad->set_mkldnn_prim_desc(x_grad_mem_pd);
   }
 };
...
paddle/fluid/operators/ngraph/CMakeLists.txt
...
@@ -2,4 +2,5 @@ if(WITH_NGRAPH)
   cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
   cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
   op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
+  add_subdirectory(ops)
 endif()
paddle/fluid/operators/ngraph/ngraph_bridge.cc
...
@@ -19,50 +19,21 @@ limitations under the License. */
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
 #include "paddle/fluid/operators/ngraph/ngraph_ops.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
 namespace operators {
 
-namespace NG_OPS = paddle::operators::ngraphs;
-std::map<std::string,
-         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
-                            std::shared_ptr<std::unordered_map<
-                                std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphBridge::NG_NODE_MAP = {
-        {"accuracy", NG_OPS::BuildAccuracyNode},
-        {"conv2d", NG_OPS::BuildConv2dNode},
-        {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
-        {"batch_norm", NG_OPS::BuildBatchNormNode},
-        {"batch_norm_grad", NG_OPS::BuildBatchNormGradNode},
-        {"cross_entropy", NG_OPS::BuildCrossEntropyNode},
-        {"cross_entropy_grad", NG_OPS::BuildCrossEntropyGradNode},
-        {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
-        {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
-        {"fill_constant", NG_OPS::BuildFillConstantNode},
-        {"mean", NG_OPS::BuildMeanNode},
-        {"mean_grad", NG_OPS::BuildMeanGradNode},
-        {"momentum", NG_OPS::BuildMomentumNode},
-        {"mul", NG_OPS::BuildMulNode},
-        {"mul_grad", NG_OPS::BuildMulGradNode},
-        {"pool2d", NG_OPS::BuildPool2dNode},
-        {"pool2d_grad", NG_OPS::BuildPool2dGradNode},
-        {"softmax", NG_OPS::BuildSoftmaxNode},
-        {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
-        {"scale", NG_OPS::BuildScaleNode},
-        {"sigmoid", NG_OPS::BuildUnaryNode<ngraph::op::Sigmoid>},
-        {"sum", NG_OPS::BuildSumNode},
-        {"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
-        {"relu_grad", NG_OPS::BuildReluGradNode},
-        {"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
-        {"tanh_grad", NG_OPS::BuildTanhGradNode},
-        {"top_k", NG_OPS::BuildTopKNode}};
+bool NgraphBridge::isRegister(const std::string& str) {
+  return ops::NgraphSingleton::Lookup(str);
+}
 
 void NgraphBridge::BuildNgNode(
     const std::shared_ptr<framework::OperatorBase>& op) {
   auto& op_type = op->Type();
-  NG_NODE_MAP[op_type](op, ngb_node_map_);
+  ops::NgraphSingleton::BuildNode(ngb_node_map_, op, op_type);
 }
 
 }  // namespace operators
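The hand-maintained NG_NODE_MAP is gone; each op header now registers itself through REGISTER_NG_OP and the bridge only queries a singleton. The sketch below shows the usual shape of such a self-registration macro, a singleton map plus a namespace-scope initializer, and is a generic pattern, not the actual contents of ops/op_bridge.h:

#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

// Generic self-registration sketch (not Paddle's op_bridge.h).
class Registry {
 public:
  using Builder = std::function<void()>;
  static Registry& Instance() {
    static Registry r;  // constructed once, on first use
    return r;
  }
  void Add(const std::string& name, Builder b) { map_[name] = std::move(b); }
  bool Has(const std::string& name) const { return map_.count(name) > 0; }

 private:
  std::unordered_map<std::string, Builder> map_;
};

// Each use expands to a namespace-scope object whose initializer runs at
// program start-up and inserts the builder into the singleton map.
#define REGISTER_DEMO_OP(name, fn)       \
  static int __reg_##name = [] {         \
    Registry::Instance().Add(#name, fn); \
    return 0;                            \
  }()

void BuildDemoNode() { std::cout << "building demo\n"; }
REGISTER_DEMO_OP(demo, BuildDemoNode);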
...
paddle/fluid/operators/ngraph/ngraph_bridge.h
...
@@ -28,13 +28,6 @@ namespace operators {
 class NgraphBridge {
  public:
-  static std::map<
-      std::string,
-      std::function<void(const std::shared_ptr<framework::OperatorBase>&,
-                         std::shared_ptr<std::unordered_map<
-                             std::string, std::shared_ptr<ngraph::Node>>>)>>
-      NG_NODE_MAP;
-
   explicit NgraphBridge(
       std::shared_ptr<
           std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
...
@@ -43,6 +36,8 @@ class NgraphBridge {
   void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op);
 
+  static bool isRegister(const std::string& str);
+
  private:
   std::shared_ptr<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
...
paddle/fluid/operators/ngraph/ngraph_engine.cc
...
@@ -88,14 +88,12 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
   int pivot = left;
   while (pivot < right) {
     auto op_type = ops.at(pivot)->Type();
-    if (NgraphBridge::NG_NODE_MAP.find(op_type) ==
-        NgraphBridge::NG_NODE_MAP.end()) {
+    if (NgraphBridge::isRegister(op_type)) {
       ++pivot;
     } else {
       int start = pivot, end = start;
       while (pivot < right &&
-             (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) !=
-              NgraphBridge::NG_NODE_MAP.end())) {
+             (!NgraphBridge::isRegister(ops.at(pivot)->Type()))) {
         ++pivot;
         ++end;
       }
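One detail worth noting from this hunk: isRegister takes over exactly the role of the old `find(...) == end()` test, so despite its name it returns true when an op is *not* known to the bridge. The interval scan itself, reduced to a vector of flags (illustrative names; supported means "the bridge can build this op"):

#include <utility>
#include <vector>

// Collect maximal [start, end) runs of supported ops, mirroring the scan in
// NgraphOpIntervals.
std::vector<std::pair<int, int>> SupportedIntervals(
    const std::vector<bool>& supported) {
  std::vector<std::pair<int, int>> intervals;
  int pivot = 0;
  const int right = static_cast<int>(supported.size());
  while (pivot < right) {
    if (!supported[pivot]) {
      ++pivot;  // skip an unsupported op
    } else {
      int start = pivot;
      while (pivot < right && supported[pivot]) ++pivot;
      intervals.emplace_back(start, pivot);
    }
  }
  return intervals;
}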
...
paddle/fluid/operators/ngraph/ops/CMakeLists.txt
0 → 100644
file(GLOB LIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/ngraph/ngraph_ops.h)
file(APPEND ${pass_file} "\#pragma once\n")
file(WRITE ${pass_file} "// Generated by the /paddle/fluid/operators/ngraph/ops/CMakeLists.txt.  DO NOT EDIT!\n\n")

foreach(OPS_NAME ${LIST_OPS})
  file(APPEND ${pass_file} "\#include \"paddle/fluid/operators/ngraph/ops/${OPS_NAME}\"\n")
endforeach(OPS_NAME)
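The aggregate header ngraph_ops.h is now generated from whatever *.h files sit in ops/, instead of being edited by hand; note that since file(WRITE) truncates the file, the `#pragma once` APPENDed on the previous line is discarded and the output starts with the banner. A hypothetical excerpt of the generated file:

// Generated by the /paddle/fluid/operators/ngraph/ops/CMakeLists.txt.  DO NOT EDIT!

#include "paddle/fluid/operators/ngraph/ops/accuracy_op.h"
#include "paddle/fluid/operators/ngraph/ops/activation_op.h"
#include "paddle/fluid/operators/ngraph/ops/batch_norm_op.h"
// ... one include per header found by the GLOB ...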
paddle/fluid/operators/ngraph/ops/accuracy_op.h
浏览文件 @
cf0511f2
...
@@ -17,6 +17,7 @@ limitations under the License. */
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -63,3 +64,5 @@ void BuildAccuracyNode(
...
@@ -63,3 +64,5 @@ void BuildAccuracyNode(
}
// namespace ngraphs
}
// namespace ngraphs
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
REGISTER_NG_OP
(
accuracy
,
BuildAccuracyNode
);
paddle/fluid/operators/ngraph/ops/activation_op.h
浏览文件 @
cf0511f2
...
@@ -17,6 +17,7 @@ limitations under the License. */
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include <string>
#include "ngraph/ngraph.hpp"
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -50,3 +51,6 @@ void BuildTanhGradNode(
...
@@ -50,3 +51,6 @@ void BuildTanhGradNode(
}
// namespace ngraphs
}
// namespace ngraphs
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
REGISTER_NG_OP
(
relu_grad
,
BuildReluGradNode
);
REGISTER_NG_OP
(
than_grad
,
BuildTanhGradNode
);
paddle/fluid/operators/ngraph/ops/batch_norm_op.h
浏览文件 @
cf0511f2
...
@@ -20,6 +20,7 @@ limitations under the License. */
...
@@ -20,6 +20,7 @@ limitations under the License. */
#include "ngraph/ngraph.hpp"
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -155,3 +156,6 @@ void BuildBatchNormGradNode(
...
@@ -155,3 +156,6 @@ void BuildBatchNormGradNode(
}
// namespace ngraphs
}
// namespace ngraphs
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
REGISTER_NG_OP
(
batch_norm
,
BuildBatchNormNode
);
REGISTER_NG_OP
(
batch_norm_grad
,
BuildBatchNormGradNode
);
paddle/fluid/operators/ngraph/ops/binary_unary_op.h
浏览文件 @
cf0511f2
...
@@ -16,6 +16,7 @@ limitations under the License. */
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <string>
#include "ngraph/ngraph.hpp"
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -47,3 +48,7 @@ static void BuildUnaryNode(
...
@@ -47,3 +48,7 @@ static void BuildUnaryNode(
}
// namespace ngraphs
}
// namespace ngraphs
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
REGISTER_NG_OP
(
relu
,
BuildUnaryNode
<
ngraph
::
op
::
Relu
>
);
REGISTER_NG_OP
(
tanh
,
BuildUnaryNode
<
ngraph
::
op
::
Tanh
>
);
REGISTER_NG_OP
(
sigmoid
,
BuildUnaryNode
<
ngraph
::
op
::
Sigmoid
>
);
paddle/fluid/operators/ngraph/ops/conv2d_op.h
浏览文件 @
cf0511f2
...
@@ -17,6 +17,7 @@ limitations under the License. */
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -233,3 +234,6 @@ void BuildConv2dGradNode(
...
@@ -233,3 +234,6 @@ void BuildConv2dGradNode(
}
// namespace ngraphs
}
// namespace ngraphs
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
REGISTER_NG_OP
(
conv2d
,
BuildConv2dNode
);
REGISTER_NG_OP
(
conv2d_grad
,
BuildConv2dGradNode
);
paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
浏览文件 @
cf0511f2
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <string>
#include <string>
#include "ngraph/ngraph.hpp"
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -143,3 +144,6 @@ void BuildCrossEntropyGradNode(
...
@@ -143,3 +144,6 @@ void BuildCrossEntropyGradNode(
}
// namespace ngraphs
}
// namespace ngraphs
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
REGISTER_NG_OP
(
cross_entropy
,
BuildCrossEntropyNode
);
REGISTER_NG_OP
(
cross_entropy_grad
,
BuildCrossEntropyGradNode
);
paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
浏览文件 @
cf0511f2
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

 namespace paddle {
@@ -85,3 +86,6 @@ void BuildElementwiseAddGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(elementwise_add, BuildElementwiseAddNode);
+REGISTER_NG_OP(elementwise_add_grad, BuildElementwiseAddGradNode);
paddle/fluid/operators/ngraph/ops/fill_constant_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

 namespace paddle {
@@ -55,3 +56,5 @@ void BuildFillConstantNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(fill_constant, BuildFillConstantNode);
paddle/fluid/operators/ngraph/ops/mean_op.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

 namespace paddle {
@@ -64,3 +65,6 @@ void BuildMeanGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(mean, BuildMeanNode);
+REGISTER_NG_OP(mean_grad, BuildMeanGradNode);
paddle/fluid/operators/ngraph/ops/momentum_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

 namespace paddle {
@@ -99,3 +100,5 @@ void BuildMomentumNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(momentum, BuildMomentumNode);
paddle/fluid/operators/ngraph/ops/mul_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

 namespace paddle {
@@ -130,3 +131,6 @@ static void BuildMulGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(mul, BuildMulNode);
+REGISTER_NG_OP(mul_grad, BuildMulGradNode);
paddle/fluid/operators/ngraph/ops/op_bridge.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>

#include "ngraph/node.hpp"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace operators {
namespace ops {

class NgraphSingleton {
  NgraphSingleton() = default;
  NgraphSingleton(NgraphSingleton const&) = delete;
  void operator=(NgraphSingleton const) = delete;
  ~NgraphSingleton() = default;

  static std::map<
      std::string,
      std::function<void(const std::shared_ptr<framework::OperatorBase>&,
                         std::shared_ptr<std::unordered_map<
                             std::string, std::shared_ptr<ngraph::Node>>>)>>
      ng_node_maps_;

 public:
  template <typename TF>
  static void Register(TF&& tf, const std::string& name) {
    ng_node_maps_[name] = tf;
  }

  // Note the inverted sense: returns true when NO converter has been
  // registered for `name`, false when one exists.
  static bool Lookup(const std::string& name) {
    auto it = ng_node_maps_.find(name);
    if (it == ng_node_maps_.end()) {
      return true;
    }
    return false;
  }

  static void BuildNode(
      const std::shared_ptr<std::unordered_map<
          std::string, std::shared_ptr<ngraph::Node>>>& ng_maps,
      const std::shared_ptr<framework::OperatorBase>& op,
      const std::string& name) {
    ng_node_maps_[name](op, ng_maps);
  }
};

std::map<std::string,
         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
                            std::shared_ptr<std::unordered_map<
                                std::string, std::shared_ptr<ngraph::Node>>>)>>
    NgraphSingleton::ng_node_maps_;

}  // namespace ops
}  // namespace operators
}  // namespace paddle

#define REGISTER_NG_OP(op_type__, Converter__)                   \
  struct ng_##op_type__##_converter {                            \
    ng_##op_type__##_converter() {                               \
      paddle::operators::ops::NgraphSingleton::Register(         \
          paddle::operators::ngraphs::Converter__, #op_type__);  \
    }                                                            \
  };                                                             \
  ng_##op_type__##_converter ng_##op_type__##_converter__;
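To show how the bridge is meant to be used, here is a minimal sketch of registering a converter for a hypothetical operator (`my_op` and `BuildMyOpNode` are illustrative names, not part of this commit; real converters follow the pattern of the op headers above):

// Hypothetical converter; a real one lives in paddle::operators::ngraphs and
// translates one fluid operator into ngraph nodes stored in ng_maps.
namespace paddle {
namespace operators {
namespace ngraphs {

void BuildMyOpNode(
    const std::shared_ptr<framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ng_maps) {
  // ... look up the op's inputs in ng_maps, build ngraph nodes, store outputs ...
}

}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle

// The macro expands to a struct whose constructor calls
// NgraphSingleton::Register, plus one global instance of that struct, so the
// converter is registered during static initialization, before any graph is built.
REGISTER_NG_OP(my_op, BuildMyOpNode);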
paddle/fluid/operators/ngraph/ops/pool2d_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

 namespace paddle {
@@ -172,3 +173,6 @@ void BuildPool2dGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(pool2d, BuildPool2dNode);
+REGISTER_NG_OP(pool2d_grad, BuildPool2dGradNode);
paddle/fluid/operators/ngraph/ops/scale_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

 namespace paddle {
@@ -37,3 +38,5 @@ void BuildScaleNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(scale, BuildScaleNode);
paddle/fluid/operators/ngraph/ops/softmax_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

 namespace paddle {
@@ -72,3 +73,6 @@ void BuildSoftmaxGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(softmax, BuildSoftmaxNode);
+REGISTER_NG_OP(softmax_grad, BuildSoftmaxGradNode);
paddle/fluid/operators/ngraph/ops/top_k_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

 namespace paddle {
@@ -42,3 +43,5 @@ void BuildTopKNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(top_k, BuildTopKNode);
paddle/fluid/operators/pool_op.cc
@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
       "be ignored.");  // TODO(Chengduo): Add checker. (Currently,
                        // TypedAttrChecker don't support vector type.)
   AddAttr<bool>("global_pooling",
                 "(bool, default false) Whether to use the global pooling. "
-                "If global_pooling = true, ksize and paddings will be ignored.")
+                "If global_pooling = true, kernel size and paddings "
+                "will be ignored.")
       .SetDefault(false);
   AddAttr<std::vector<int>>("strides",
                             "(vector<int>, default {1, 1}), strides(height, "
@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
       "paddings",
       "(vector<int>, default {0,0}), paddings(height, width) of pooling "
       "operator."
-      "If global_pooling = true, paddings and ksize will be ignored.")
+      "If global_pooling = true, paddings and kernel size will be ignored.")
       .SetDefault({0, 0});
   AddAttr<bool>(
       "exclusive",
@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
       .SetDefault(false);
   AddAttr<bool>(
       "ceil_mode",
-      "(bool, default false) Wether to use the ceil function to calculate "
+      "(bool, default false) Whether to use the ceil function to calculate "
      "output height and width. False is the default. If it is set to False, "
       "the floor function will be used.")
       .SetDefault(false);
@@ -262,28 +263,37 @@ Example:
  For exclusive = false:
  $$
  hstart = i * strides[0] - paddings[0]
  $$
  $$
  hend = hstart + ksize[0]
  $$
  $$
  wstart = j * strides[1] - paddings[1]
  $$
  $$
  wend = wstart + ksize[1]
  $$
  $$
  Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
  $$

  For exclusive = true:
  $$
  hstart = max(0, i * strides[0] - paddings[0])
  $$
  $$
  hend = min(H, hstart + ksize[0])
  $$
  $$
  wstart = max(0, j * strides[1] - paddings[1])
  $$
  $$
  wend = min(W, wstart + ksize[1])
  $$
  $$
  Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
  $$

+ For adaptive = true:
+ $$
+ hstart = floor(i * H_{in} / H_{out})
+ hend = ceil((i + 1) * H_{in} / H_{out})
+ wstart = floor(j * W_{in} / W_{out})
+ wend = ceil((j + 1) * W_{in} / W_{out})
+ $$
+ $$
+ Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+ $$
 )DOC");
 }
@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
   AddAttr<bool>(
       "global_pooling",
       "(bool, default false) Whether to use the global pooling. "
-      "If global_pooling = true, ksize and paddings wille be ignored.")
+      "If global_pooling = true, kernel size and paddings will be ignored.")
       .SetDefault(false);
   AddAttr<std::vector<int>>(
       "strides",
@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
       .SetDefault(false);
   AddAttr<bool>(
       "ceil_mode",
-      "(bool, default false) Wether to use the ceil function to calculate "
+      "(bool, default false) Whether to use the ceil function to calculate "
       "output height and width. False is the default. If it is set to False, "
       "the floor function will be used.")
       .SetDefault(false);
@@ -393,45 +403,65 @@ Example:
  Out shape: $(N, C, D_{out}, H_{out}, W_{out})$

  For ceil_mode = false:
- $$
- D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
- H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
- W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
- $$
+ $$
+ D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
+ $$
+ $$
+ H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
+ $$
+ $$
+ W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
+ $$

  For ceil_mode = true:
- $$
- D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
- H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
- W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
- $$
+ $$
+ D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
+ $$
+ $$
+ H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
+ $$
+ $$
+ W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
+ $$

  For exclusive = false:
  $$
  dstart = i * strides[0] - paddings[0]
  $$
  $$
  dend = dstart + ksize[0]
  $$
  $$
  hstart = j * strides[1] - paddings[1]
  $$
  $$
  hend = hstart + ksize[1]
  $$
  $$
  wstart = k * strides[2] - paddings[2]
  $$
  $$
  wend = wstart + ksize[2]
  $$
  $$
  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
  $$

  For exclusive = true:
  $$
  dstart = max(0, i * strides[0] - paddings[0])
  $$
  $$
  dend = min(D, dstart + ksize[0])
  $$
  $$
  hstart = max(0, j * strides[1] - paddings[1])
  $$
  $$
  hend = min(H, hstart + ksize[1])
  $$
  $$
  wstart = max(0, k * strides[2] - paddings[2])
  $$
  $$
  wend = min(W, wstart + ksize[2])
  $$
+ $$
+ Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+ $$

+ For adaptive = true:
+ $$
+ dstart = floor(i * D_{in} / D_{out})
+ dend = ceil((i + 1) * D_{in} / D_{out})
+ hstart = floor(j * H_{in} / H_{out})
+ hend = ceil((j + 1) * H_{in} / H_{out})
+ wstart = floor(k * W_{in} / W_{out})
+ wend = ceil((k + 1) * W_{in} / W_{out})
+ $$
  $$
  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
  $$
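For a concrete feel of the ceil_mode formulas above, a minimal standalone sketch computing one output extent both ways (the sizes are made up):

#include <cstdio>

int main() {
  const int H_in = 7, ksize = 2, padding = 0, stride = 2;

  // ceil_mode = false: plain integer (floor) division; the trailing partial
  // window at the border is dropped.
  const int H_floor = (H_in - ksize + 2 * padding) / stride + 1;               // 3

  // ceil_mode = true: stride - 1 is added before dividing, so the partial
  // window at the border still contributes one output element.
  const int H_ceil = (H_in - ksize + 2 * padding + stride - 1) / stride + 1;   // 4

  std::printf("floor: %d, ceil: %d\n", H_floor, H_ceil);
  return 0;
}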
paddle/fluid/operators/sample_logits_op.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/sample_logits_op.h"
#include "paddle/fluid/operators/math/sample_prob.h"

namespace paddle {
namespace operators {

class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Logits",
             "(Tensor, default: Tensor<float>), The unscaled log probabilities "
             "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
             "and K is the class number.");
    AddInput("Labels",
             "(Tensor) The ground truth which is a 2-D tensor. Labels is a "
             "Tensor<int64> with shape [N x NT], where NT is the number of "
             "true labels for each example.");
    AddInput("CustomizedSamples",
             "(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape "
             "[N, NT + S], where N is the batch size, NT is the number of "
             "true labels and S is the number of negative samples for each "
             "example. The first NT elements of each row should be the same "
             "as the true labels, followed by S customized negative samples. "
             "This tensor is only used when use_customized_samples is true.")
        .AsDispensable();
    AddInput("CustomizedProbabilities",
             "(Tensor, default: Tensor<float>), A 2-D tensor with shape "
             "[N, NT + S]. The tensor has the same shape as "
             "CustomizedSamples, and each element is the probability of the "
             "corresponding element in CustomizedSamples. This tensor is "
             "only used when use_customized_samples is true.")
        .AsDispensable();
    AddOutput("Samples",
              "(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape "
              "[N, NT + S]. The output of the sampler: NT true labels and S "
              "negative samples for each example. This will be used in the "
              "backward computation.")
        .AsIntermediate();
    AddOutput("Probabilities",
              "(Tensor, default: Tensor<float>), A 2-D tensor with shape "
              "[N, NT + S]. The probabilities of the sampled positive and "
              "negative labels.")
        .AsIntermediate();
    AddOutput("SampledLogits",
              "(Tensor, default: Tensor<float>), A 2-D tensor with shape "
              "[N, NT + S]. The sampled logits, which will be used in the "
              "backward propagation.")
        .AsIntermediate();
    AddOutput("SampledLabels",
              "(Tensor, default: Tensor<int64>), A 2-D tensor of sampled "
              "labels with shape [N, NT]. The tensor contains hard labels "
              "suitable as input to a softmax op, i.e. 0, 1, ..., NT-1, "
              "because the first NT elements of Samples are the positive "
              "labels.");
    AddAttr<bool>(
        "use_customized_samples",
        "An indicator whether to use customized samples with probabilities; "
        "if True, the operator will use customized samples and customized "
        "probabilities; otherwise, the operator will generate them by itself.")
        .SetDefault(false);
    AddAttr<bool>(
        "uniq",
        "An indicator whether to sample non-repetitive negative labels; if "
        "True, the operator will sample negative labels without replacement; "
        "otherwise, the operator will sample negative labels with "
        "replacement.")
        .SetDefault(true);
    AddAttr<bool>(
        "remove_accidental_hits",
        "An indicator whether to remove accidental hits when a sample hits a "
        "true label; the removal is implemented by subtracting the "
        "corresponding logit by float_max to suppress its softmax to zero.")
        .SetDefault(true);
    AddAttr<int>("num_samples", "The number of negative samples.");
    AddAttr<int>("seed", "Random seed for generating samples").SetDefault(0);

    AddComment(R"DOC(
Computes sampled output training logits and labels suitable for implementing
sampled softmax.
)DOC");
  }
};

class SampleLogitsOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Logits"),
                   "Input(Logits) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("Labels"),
                   "Input(Labels) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput("Samples"),
                   "Output(Samples) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput("Probabilities"),
                   "Output(Probabilities) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput("SampledLogits"),
                   "Output(SampledLogits) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput("SampledLabels"),
                   "Output(SampledLabels) should be not null.");

    auto logits_dims = ctx->GetInputDim("Logits");
    auto labels_dims = ctx->GetInputDim("Labels");

    PADDLE_ENFORCE_EQ(
        logits_dims.size(), 2UL,
        "The logits of softmax_with_cross_entropy should be a 2-D tensor.");
    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
                      "The labels should be a 2-D tensor.");

    const int num_samples = ctx->Attrs().Get<int>("num_samples");
    const int num_sampled_classes = labels_dims[1] + num_samples;
    ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes});
    ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes});
    ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes});
    ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]});
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Logits"));
    framework::OpKernelType kt =
        framework::OpKernelType(data_type, ctx.device_context());
    return kt;
  }
};

// UNDERSTAND: InferShape for Grad
class SampleLogitsOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Logits"),
                   "Input(Logits) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Labels"),
                   "Input(Labels) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("Samples"),
                   "Input(Samples) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("SampledLogits"),
                   "Input(SampledLogits) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("SampledLogits")),
                   "Input(SampledLogits@Grad) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
                   "Output(Logits@Grad) should be not null.");

    auto logit_dims = ctx->GetInputDim("Logits");
    auto label_dims = ctx->GetInputDim("Labels");
    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
                      "The label should be a 2-D tensor.");
    PADDLE_ENFORCE_EQ(logit_dims.size(), 2UL,
                      "The logits should be a 2-D tensor.");

    ctx->SetOutputDim(framework::GradVarName("Logits"),
                      ctx->GetInputDim("Logits"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(
        ctx.InputVar(framework::GradVarName("SampledLogits")));
    framework::OpKernelType kt =
        framework::OpKernelType(data_type, ctx.device_context());
    return kt;
  }
};

// UNDERSTAND: what's the rule for making a GradMaker TODO
class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    auto* grad_op = new framework::OpDesc();
    grad_op->SetType("sample_logits_grad");
    grad_op->SetInput("Logits", Input("Logits"));
    grad_op->SetInput("Labels", Input("Labels"));
    grad_op->SetInput("Samples", Output("Samples"));
    grad_op->SetInput("SampledLogits", Output("SampledLogits"));
    grad_op->SetInput(framework::GradVarName("SampledLogits"),
                      OutputGrad("SampledLogits"));
    grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
    grad_op->SetAttrMap(Attrs());
    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(sample_logits, ops::SampleLogitsOp, ops::SampleLogitsOpMaker,
                  ops::SampleLogitsGradMaker);
REGISTER_OPERATOR(sample_logits_grad, ops::SampleLogitsOpGrad);
REGISTER_OP_CPU_KERNEL(sample_logits, ops::SampleLogitsKernel<float>,
                       ops::SampleLogitsKernel<double>);
REGISTER_OP_CPU_KERNEL(sample_logits_grad, ops::SampleLogitsGradKernel<float>,
                       ops::SampleLogitsGradKernel<double>);
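As a quick illustration of the shape contract enforced by InferShape above, a minimal sketch with made-up sizes:

#include <cstdio>

int main() {
  // Hypothetical sizes, purely illustrative.
  const int N = 32;     // batch size (Logits is [N x K])
  const int NT = 1;     // true labels per example (Labels is [N x NT])
  const int S = 100;    // the num_samples attribute

  // Every sampled output gets the width NT + S:
  const int num_sampled_classes = NT + S;  // 101
  std::printf("Samples/Probabilities/SampledLogits: [%d, %d]\n", N,
              num_sampled_classes);       // [32, 101]
  std::printf("SampledLabels: [%d, %d]\n", N, NT);  // [32, 1]
  return 0;
}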
paddle/fluid/operators/sample_logits_op.cu
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/sample_logits_op.h"

namespace paddle {
namespace operators {

// UNDERSTAND: something like take_along_axis in numpy.
template <typename T>
__global__ void GPUTakeAlongD1(size_t size, const int batch_size,
                               const int array_slice_size,
                               const int idx_slice_size, const T* p_array,
                               const int64_t* p_index, T* p_value) {
  const auto value_slice_size = idx_slice_size;
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int step_size = blockDim.x * gridDim.x;

  for (; idx < size; idx += step_size) {
    int i = idx / idx_slice_size;
    auto array_index = p_index[idx];
    p_value[idx] = p_array[i * array_slice_size + array_index];
  }
}

// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate
// indices, scatter is done in += way.
template <typename T>
__global__ void GPUPutAlongD1(size_t size, const int batch_size,
                              const int array_slice_size,
                              const int idx_slice_size, T* p_array,
                              const int64_t* p_index, const T* p_value) {
  const auto value_slice_size = idx_slice_size;
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int step_size = blockDim.x * gridDim.x;

  // size == batch_size
  for (; idx < size; idx += step_size) {
    int i = idx;
    for (int j = 0; j < idx_slice_size; ++j) {
      auto array_index = p_index[i * idx_slice_size + j];
      p_array[i * array_slice_size + array_index] +=
          p_value[i * idx_slice_size + j];
    }
  }
}

// UNDERSTAND: set label as 0,1,...,num_true-1
template <typename T>
__global__ void GPUSetLabel(size_t size, const int num_true,
                            int64_t* p_array) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int step_size = blockDim.x * gridDim.x;

  for (; idx < size; idx += step_size) {
    p_array[idx] = idx % num_true;
  }
}

// UNDERSTAND: compute accidental hits from samples and subtract the
// corresponding logits by a float max, here 1e20
template <typename T>
__global__ void gpu_compute_remove_accidental_hits(const int size,
                                                   const int num_true,
                                                   const int idx_slice_size,
                                                   const int64_t* p_index,
                                                   T* p_value) {
  const auto value_slice_size = idx_slice_size;
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int step_size = blockDim.x * gridDim.x;

  for (; idx < size; idx += step_size) {
    int i = idx / idx_slice_size;
    if (idx % idx_slice_size < num_true) continue;
    for (int j = 0; j < num_true; ++j) {
      const auto true_idx = i * idx_slice_size + j;
      if (p_index[true_idx] == p_index[idx]) {
        p_value[idx] -= 1e20;
        break;
      }
    }
  }
}

template <typename T>
class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
 public:
  using Tensor = framework::Tensor;
  void Compute(const framework::ExecutionContext& context) const override {
    // get necessary inputs
    const Tensor* logits = context.Input<Tensor>("Logits");
    const Tensor* labels = context.Input<Tensor>("Labels");
    VLOG(3) << "Enter SampleLogitsCUDAKernel";

    // get necessary outputs
    Tensor* samples = context.Output<Tensor>("Samples");
    Tensor* probabilities = context.Output<Tensor>("Probabilities");
    Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
    Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");

    // shapes
    const auto batch_size = logits->dims()[0];
    const auto num_classes = logits->dims()[1];
    const auto labels_dim = labels->dims();
    const auto num_true = labels_dim[1];
    const auto samples_dim = samples->dims();

    // attrs
    const auto num_samples = context.Attr<int>("num_samples");
    const bool use_customized_samples =
        context.Attr<bool>("use_customized_samples");
    const bool uniq = context.Attr<bool>("uniq");
    const bool remove_accidental_hits =
        context.Attr<bool>("remove_accidental_hits");

    // device contexts
    auto& dev_ctx = context.cuda_device_context();

    // UNDERSTAND: allocate memories for temporaries
    sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
    set_zero(dev_ctx, sampled_logits, static_cast<T>(0));

    auto sampled_labels_data =
        sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
    int threads = 512;
    size_t size = batch_size * num_true;
    int grid = (size + threads - 1) / threads;
    GPUSetLabel<
        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
        size, num_true, sampled_labels_data);

    if (use_customized_samples) {
      const Tensor* customized_samples =
          context.Input<Tensor>("CustomizedSamples");
      const Tensor* customized_probabilities =
          context.Input<Tensor>("CustomizedProbabilities");
      samples->ShareDataWith(*customized_samples);
      probabilities->ShareDataWith(*customized_probabilities);
    } else {
      samples->mutable_data<int64_t>(context.GetPlace());
      probabilities->mutable_data<T>(samples_dim, context.GetPlace());
      // UNDERSTAND: sampling
      const auto seed = context.Attr<int>("seed");
      auto sampler_with_prob = math::GPUSampleWithProb<T>();
      sampler_with_prob(context.cuda_device_context(), seed, num_classes, uniq,
                        num_samples, labels, samples, probabilities);
    }

    // UNDERSTAND: gather sampled logits and remove accidental hits if needed
    const auto num_take = samples->dims()[1];
    const auto array_dims = logits->dims();
    const auto idx_dims = samples->dims();

    const T* p_array = logits->data<T>();
    const int64_t* p_index = samples->data<int64_t>();
    T* p_value = sampled_logits->data<T>();

    // src slice size
    const auto array_slice_size = array_dims[1];
    // index slice size
    const auto idx_slice_size = idx_dims[1];

    size = batch_size * num_take;
    grid = (size + threads - 1) / threads;
    GPUTakeAlongD1<
        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
        p_value);

    if (remove_accidental_hits) {
      const size_t size = batch_size * (num_true + num_samples);
      int grid = (size + threads - 1) / threads;
      gpu_compute_remove_accidental_hits<
          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
          size, num_true, idx_slice_size, p_index, p_value);
    }

    // subtract the sampled logits by log Q(y|x)
    auto probs = EigenMatrix<T>::From(*probabilities);
    auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
    smp_logits.device(*dev_ctx.eigen_device()) =
        (smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
            .unaryExpr(TolerableValue<T>());
  }
};

template <typename T>
class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
 public:
  using Tensor = framework::Tensor;
  void Compute(const framework::ExecutionContext& context) const override {
    auto logits_grad =
        context.Output<Tensor>(framework::GradVarName("Logits"));
    const Tensor* samples = context.Input<Tensor>("Samples");
    const Tensor* sampled_logits_grad =
        context.Input<Tensor>(framework::GradVarName("SampledLogits"));
    logits_grad->mutable_data<T>(context.GetPlace());

    auto& dev_ctx = context.cuda_device_context();
    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
    set_zero(dev_ctx, logits_grad, static_cast<T>(0));

    // UNDERSTAND: scatter it back to logit_grad
    const auto batch_size = samples->dims()[0];
    const auto num_put = samples->dims()[1];
    const auto array_dims = logits_grad->dims();
    const auto idx_dims = samples->dims();

    T* p_array = logits_grad->data<T>();
    const int64_t* p_index = samples->data<int64_t>();
    const T* p_value = sampled_logits_grad->data<T>();

    // src slice size
    const auto array_slice_size = array_dims[1];
    // index slice size
    const auto idx_slice_size = idx_dims[1];

    int threads = 128;
    const size_t size = batch_size;
    int grid = (size + threads - 1) / threads;

    GPUPutAlongD1<
        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
        p_value);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_CUDA_KERNEL(sample_logits, ops::SampleLogitsCUDAKernel<float>,
                        ops::SampleLogitsCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(sample_logits_grad,
                        ops::SampleLogitsGradCUDAKernel<float>,
                        ops::SampleLogitsGradCUDAKernel<double>);
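The kernels above size their launches with the usual ceil-division plus grid-stride-loop idiom; a minimal host-side sketch of that arithmetic in plain C++ (the sizes are made up):

#include <cstddef>
#include <cstdio>

int main() {
  const int threads = 512;        // threads per block, as in the kernels above
  const size_t size = 32 * 101;   // total elements, e.g. batch_size * (NT + S)

  // Ceil division: enough blocks so that grid * threads >= size.
  const int grid = static_cast<int>((size + threads - 1) / threads);
  std::printf("launch %d block(s) of %d threads for %zu elements\n", grid,
              threads, size);

  // Each device thread then walks a grid-stride loop:
  //   for (idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size;
  //        idx += blockDim.x * gridDim.x) { ... }
  // so the kernel stays correct even if the grid is capped below its ideal size.
  return 0;
}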
paddle/fluid/operators/sample_logits_op.h
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <unordered_set>  // used by compute_remove_accidental_hits below
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

// Replaces +/-inf by a large finite value so that downstream arithmetic on
// the sampled logits cannot propagate infinities.
template <typename T>
struct TolerableValue {
  HOSTDEVICE T operator()(const T& x) const {
    PADDLE_ASSERT(std::is_floating_point<T>::value);
    const T kApproInf = 1e20;
    if (x == INFINITY) return kApproInf;
    if (x == -INFINITY) return -kApproInf;
    return x;
  }
};
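// Illustration (editor's note, not part of the original file): the clamp
// matters because the kernels below compute logits - log(Q(y|x)), and a
// sampled probability of 0 would give log(0) = -inf. With the functor,
//   TolerableValue<float>()(std::log(0.0f))  ==  -1e20f
//   TolerableValue<float>()(1.5f)            ==  1.5f
// so the correction stays finite instead of turning into inf/NaN.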
// UNDERSTAND: something like take_along_axis in numpy.
template <typename T>
static void CPUTakeAlongD1(const platform::DeviceContext& ctx,
                           const framework::Tensor& array,
                           const framework::Tensor& index,
                           framework::Tensor* value) {
  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
  // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
  PADDLE_ENFORCE(index.dims().size() == 2 && array.dims().size() == 2 &&
                 index.dims()[0] == array.dims()[0] &&
                 index.dims() == value->dims());

  const auto batch_size = index.dims()[0];
  const auto num_take = index.dims()[1];
  const auto array_dims = array.dims();
  const auto idx_dims = index.dims();

  // UNDERSTAND: no allocations here
  const T* p_array = array.data<T>();
  const int64_t* p_index = index.data<int64_t>();
  T* p_value = value->data<T>();

  // src slice size
  const auto array_slice_size = array_dims[1];
  // index slice size
  const auto idx_slice_size = idx_dims[1];
  const auto value_slice_size = idx_slice_size;

  for (int i = 0; i < batch_size; ++i) {
    for (int j = 0; j < num_take; ++j) {
      auto array_index = p_index[i * idx_slice_size + j];
      p_value[i * value_slice_size + j] =
          p_array[i * array_slice_size + array_index];
    }
  }
}
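// Illustration (editor's note, not part of the original file): this is
// numpy's take_along_axis on dim 1. With made-up data
//   array = {{10, 11, 12, 13},    index = {{3, 0},
//            {20, 21, 22, 23}}             {1, 1}}
// the result is value = {{13, 10}, {21, 21}}, i.e.
// value[i][j] = array[i][index[i][j]] for every row i and pick j.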
// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate
// indices, scatter is done in += way.
template <typename T>
static void CPUPutAlongD1(const platform::DeviceContext& ctx,
                          framework::Tensor* array,
                          const framework::Tensor& index,
                          const framework::Tensor& value) {
  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
  // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
  PADDLE_ENFORCE(index.dims().size() == 2 && array->dims().size() == 2 &&
                 index.dims()[0] == array->dims()[0] &&
                 index.dims() == value.dims());

  const auto batch_size = index.dims()[0];
  const auto num_put = index.dims()[1];
  auto array_dims = array->dims();
  auto idx_dims = index.dims();

  // UNDERSTAND: no allocations here
  T* p_array = array->data<T>();
  const int64_t* p_index = index.data<int64_t>();
  const T* p_value = value.data<T>();

  // slice sizes
  const auto array_slice_size = array_dims[1];
  const auto idx_slice_size = idx_dims[1];
  const auto value_slice_size = idx_slice_size;

  for (int i = 0; i < batch_size; ++i) {
    for (int j = 0; j < num_put; ++j) {
      auto array_index = p_index[i * idx_slice_size + j];
      p_array[i * array_slice_size + array_index] +=
          p_value[i * value_slice_size + j];
    }
  }
}
// UNDERSTAND: compute accidental hits from samples and subtract the
// corresponding logits by a float max, here 1e20
template <typename T>
static void compute_remove_accidental_hits(const platform::DeviceContext& ctx,
                                           framework::Tensor* sampled_logits,
                                           const framework::Tensor& samples,
                                           const int num_true) {
  const auto batch_size = sampled_logits->dims()[0];
  const auto num_sampled_classes = sampled_logits->dims()[1];
  T* sampled_logits_data = sampled_logits->data<T>();
  const auto samples_data = samples.data<int64_t>();

  std::unordered_set<int64_t> tmp_true_labels;
  for (int i = 0; i < batch_size; ++i) {
    tmp_true_labels.clear();
    tmp_true_labels.insert(samples_data + i * num_sampled_classes,
                           samples_data + i * num_sampled_classes + num_true);
    for (int j = num_true; j < num_sampled_classes; ++j) {
      const auto idx = i * num_sampled_classes + j;
      if (tmp_true_labels.find(samples_data[idx]) != tmp_true_labels.end())
        sampled_logits_data[idx] -= 1e20;
    }
  }
}
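// Illustration (editor's note, not part of the original file): with
// num_true = 1 and one row of Samples equal to {7, 3, 7, 9}, entry 2 is an
// "accidental hit" -- the sampler re-drew the true label 7 as a negative.
// Only sampled_logits[2] gets 1e20 subtracted, so after softmax that entry
// has effectively zero probability and a class can never serve as its own
// negative sample. The true-label logit at entry 0 is left untouched.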
template <typename T>
class SampleLogitsKernel : public framework::OpKernel<T> {
 public:
  using Tensor = framework::Tensor;
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
                   "This kernel only runs on CPU.");
    VLOG(3) << "Enter SampleLogitsKernel";
    // get necessary inputs
    const Tensor* logits = context.Input<Tensor>("Logits");
    const Tensor* labels = context.Input<Tensor>("Labels");

    // get necessary outputs
    Tensor* samples = context.Output<Tensor>("Samples");
    Tensor* probabilities = context.Output<Tensor>("Probabilities");
    Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
    Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");

    // shapes
    const auto batch_size = logits->dims()[0];
    const auto num_classes = logits->dims()[1];
    const auto labels_dim = labels->dims();
    const auto num_true = labels_dim[1];
    const auto samples_dim = samples->dims();

    // attrs
    const auto num_samples = context.Attr<int>("num_samples");
    const bool use_customized_samples =
        context.Attr<bool>("use_customized_samples");
    const bool remove_accidental_hits =
        context.Attr<bool>("remove_accidental_hits");

    // device contexts
    auto& dev_ctx =
        context.template device_context<platform::CPUDeviceContext>();

    // UNDERSTAND: allocate memories for temporaries
    sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
    auto sampled_labels_data =
        sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
    for (int i = 0; i < batch_size; ++i) {
      for (int j = 0; j < num_true; ++j) {
        sampled_labels_data[i * num_true + j] = j;
      }
    }

    if (use_customized_samples) {
      const Tensor* customized_samples =
          context.Input<Tensor>("CustomizedSamples");
      const Tensor* customized_probabilities =
          context.Input<Tensor>("CustomizedProbabilities");
      samples->ShareDataWith(*customized_samples);
      probabilities->ShareDataWith(*customized_probabilities);
    } else {
      samples->mutable_data<int64_t>(context.GetPlace());
      probabilities->mutable_data<T>(samples_dim, context.GetPlace());
      // UNDERSTAND: sampling
      const auto seed = context.Attr<int>("seed");
      auto sampler_with_prob =
          math::SampleWithProb<platform::CPUDeviceContext, T>();
      sampler_with_prob(dev_ctx, math::LogUniformSampler(num_classes, seed),
                        num_samples, labels, samples, probabilities);
    }

    // UNDERSTAND: gather sampled logits and remove accidental hits if needed
    CPUTakeAlongD1<T>(dev_ctx, *logits, *samples, sampled_logits);
    if (remove_accidental_hits) {
      compute_remove_accidental_hits<T>(dev_ctx, sampled_logits, *samples,
                                        num_true);
    }

    // subtract the sampled logits by log Q(y|x)
    auto probs = EigenMatrix<T>::From(*probabilities);
    auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
    smp_logits.device(*dev_ctx.eigen_device()) =
        (smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
            .unaryExpr(TolerableValue<T>());
  }
};
template <typename T>
class SampleLogitsGradKernel : public framework::OpKernel<T> {
 public:
  using Tensor = framework::Tensor;
  void Compute(const framework::ExecutionContext& context) const override {
    auto logits_grad =
        context.Output<Tensor>(framework::GradVarName("Logits"));
    const Tensor* samples = context.Input<Tensor>("Samples");
    const Tensor* sampled_logits_grad =
        context.Input<Tensor>(framework::GradVarName("SampledLogits"));
    logits_grad->mutable_data<T>(context.GetPlace());

    auto& dev_ctx =
        context.template device_context<platform::CPUDeviceContext>();
    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
    set_zero(dev_ctx, logits_grad, static_cast<T>(0));

    // UNDERSTAND: scatter it back to logit_grad
    CPUPutAlongD1<T>(dev_ctx, logits_grad, *samples, *sampled_logits_grad);
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/platform/CMakeLists.txt
@@ -89,9 +89,9 @@ cc_test(timer_test SRCS timer_test.cc DEPS timer)
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 if(WITH_GPU)
-  nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_context device_tracer)
+  nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
 else()
-  cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
+  cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
 endif()
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
paddle/fluid/platform/device_context.cc
@@ -394,7 +394,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
   int tid = platform::get_cur_thread_id();

-  std::lock_guard<std::mutex> lock(*p_mutex_.get());
+  std::lock_guard<std::mutex> lock(*p_mutex_);

   // Find KeyBlob for current thread
   auto map_it = pMap->find(tid);
@@ -427,7 +427,7 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
   int tid = platform::get_cur_thread_id();

-  std::lock_guard<std::mutex> lock(*p_mutex_.get());
+  std::lock_guard<std::mutex> lock(*p_mutex_);

   // Find KeyBlob for current thread firstly
   auto map_it = pMap->find(tid);
paddle/fluid/platform/device_tracer.cc
@@ -136,7 +136,7 @@ void EnableActivity() {
   CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
   CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
   // We don't track these activities for now.
-  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
+  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
@@ -155,7 +155,7 @@ void DisableActivity() {
   // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
-  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
+  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
   // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
   // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
   // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
@@ -212,6 +212,14 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                                 memcpy->correlationId, memcpy->bytes);
       break;
     }
+    case CUPTI_ACTIVITY_KIND_MEMSET: {
+      auto *memset = reinterpret_cast<const CUpti_ActivityMemset *>(record);
+      tracer->AddKernelRecords("MEMSET", memset->start, memset->end,
+                               memset->deviceId, memset->streamId,
+                               memset->correlationId);
+      break;
+    }
     case CUPTI_ACTIVITY_KIND_DRIVER: {
       auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
       if (api->start != 0 && api->end != 0)
@@ -348,6 +356,8 @@ class DeviceTracerImpl : public DeviceTracer {
     const std::vector<int> cbids {
         CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
         CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
+        CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
+        CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
         CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
         CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
 #if CUDA_VERSION >= 9000
@@ -601,6 +611,8 @@ void initCuptiCbidStr() {
   REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020);
   REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020);
   REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
 #if CUDA_VERSION >= 9000
   REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
   REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
paddle/fluid/platform/device_tracer.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/platform/dynload/cupti.h"
+#include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.pb.h"
@@ -32,8 +33,6 @@ inline uint64_t PosixInNsec() {
   return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
 }

-class Event;
-
 // DeviceTracer performs the following tasks:
 // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
 // 2. Collect cuda statistics: start/end ts, memory, etc.
paddle/fluid/platform/enforce.h
@@ -34,6 +34,7 @@ limitations under the License. */
 #include <type_traits>
 #include <utility>

+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "glog/logging.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/port.h"
paddle/fluid/platform/event.h
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>

namespace paddle {
namespace platform {

enum EventType { kMark, kPushRange, kPopRange };

class Event {
 public:
  // The DeviceContext is used to get the cuda stream.
  // If CPU profiling mode, can pass nullptr.
  Event(EventType type, std::string name, uint32_t thread_id);

  const EventType& type() const;
  std::string name() const { return name_; }
  uint32_t thread_id() const { return thread_id_; }

#ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
  cudaEvent_t event() const { return event_; }
  int device() const { return device_; }
#endif
#endif

  double CpuElapsedMs(const Event& e) const;
  double CudaElapsedMs(const Event& e) const;

 private:
  EventType type_;
  std::string name_;
  uint32_t thread_id_;
  int64_t cpu_ns_;
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
  int64_t gpu_ns_ = 0;

 public:
  void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
    gpu_ns_ += end_ns - start_ns;
  }

 private:
#else
  cudaEvent_t event_ = nullptr;
  int device_ = -1;
#endif
#endif
};

}  // namespace platform
}  // namespace paddle
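A minimal sketch of how a pair of Events might be used to time a CPU span, assuming (as the cpu_ns_ member suggests) that the constructor records the current timestamp; the constructor's definition lives in a .cc file that is not part of this diff, so treat this strictly as an illustration:

#include "paddle/fluid/platform/event.h"

double TimeSomething() {
  using paddle::platform::Event;
  using paddle::platform::EventType;

  // Hypothetical usage: mark the start and end of a profiled range.
  Event start(EventType::kPushRange, "my_range", /*thread_id=*/0);
  // ... the work being measured ...
  Event stop(EventType::kPopRange, "my_range", /*thread_id=*/0);

  // CpuElapsedMs presumably diffs the cpu_ns_ timestamps of the two events.
  return start.CpuElapsedMs(stop);
}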
paddle/fluid/platform/mkldnn_reuse.h
...
@@ -39,6 +39,45 @@ class MKLDNNHandler {
     return this->AcquireMemory(md, ptr, "@user_src_mem_p");
   }

+  // TODO(jczaja): extract common part and make AcquireMemory
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const mkldnn::memory::primitive_desc& mpd, void* ptr) {
+    auto local_key = key_ + "@user_src_mem_p";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   " find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(mpd, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happened. All primitives from operator instance
+      // should be reused or none of them. So we check consistency.
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
+      const mkldnn::memory::primitive_desc& mpd, void* ptr) {
+    auto local_key = key_ + "@user_weights_mem_p";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   " find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(mpd, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happened. All primitives from operator instance
+      // should be reused or none of them. So we check consistency.
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
       const mkldnn::memory::desc& md, void* ptr,
       user_function custom_func = {}) {
...
@@ -273,37 +312,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
                          mkldnn::engine engine, const std::string& base_key)
       : platform::MKLDNNHandler(dev_ctx, engine, base_key),
         dims_(dims),
-        axis_(axis) {}
+        axis_(axis),
+        logical_axis_(dims.size(), 0) {}

+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const mkldnn::memory::format& fmt, void* ptr) {
+    auto local_key = key_ + "@user_src_mem_p";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   " find mem primitive in device context");
+    if (mem_p == nullptr) {
+      // Make memory descriptor using input format, unless it
+      // cannot be trusted (nchw) then make up memory fmt manually
+      for (size_t i = 0; i < logical_axis_.size(); ++i) {
+        logical_axis_[i] = i;
+      }
+      auto src_md = fmt != mkldnn::memory::format::nchw
+                        ? platform::MKLDNNMemDesc(
+                              dims_, platform::MKLDNNGetDataType<float>(), fmt)
+                        : Axis2MemoryDesc(dims_, logical_axis_);
+      mem_p = std::make_shared<mkldnn::memory>(
+          mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happened. All primitives from operator instance
+      // should be reused or none of them. So we check consistency.
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }

   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output,
                                                    platform::Place place) {
...
@@ -388,7 +397,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
  private:
   std::vector<int> dims_;
   std::vector<int> axis_;
+  std::vector<int> logical_axis_;
 };

 template <class forward_t, class backward_data_t, class backward_weights_t>
...
@@ -548,9 +556,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
     PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
                    "Fail to find convolution primitive in device context");
     if (conv_p == nullptr) {
-      conv_p = std::make_shared<forward_t>(*conv_pd_, *(src_memory_p),
-                                           *(weights_memory_p.get()),
-                                           *(dst_memory_p.get()));
+      conv_p = std::make_shared<forward_t>(*conv_pd_, *src_memory_p,
+                                           *weights_memory_p, *dst_memory_p);

       dev_ctx_.SetBlob(prim_key, conv_p);
     } else {
...
@@ -570,9 +577,9 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
     PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
                    "Fail to find convolution primitive in device context");
     if (conv_p == nullptr) {
-      conv_p = std::make_shared<forward_t>(
-          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
-          *(bias_memory_p.get()), *(dst_memory_p.get()));
+      conv_p = std::make_shared<forward_t>(
+          *conv_pd_, *src_memory_p, *weights_memory_p, *bias_memory_p,
+          *dst_memory_p);

       dev_ctx_.SetBlob(prim_key, conv_p);
     } else {
...
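The new AcquireSrcMemory/AcquireWeightsMemory overloads all follow the same get-or-create pattern: look the memory object up in the per-device blob cache, create and register it on a miss, and on a hit repoint the data handle while flagging that reuse has started. A minimal Python sketch of that caching discipline (class and method names are illustrative, not part of Paddle):

class HandlerCache(object):
    """Get-or-create cache mirroring the AcquireXMemory pattern above."""

    def __init__(self):
        self._blobs = {}
        self._is_reusing = False

    def acquire(self, key, suffix, make_memory, ptr):
        local_key = key + suffix
        mem = self._blobs.get(local_key)
        if mem is None:
            # A miss is only legal before reuse starts: all primitives of an
            # operator instance are reused together, or none of them are.
            assert not self._is_reusing
            mem = make_memory(ptr)
            self._blobs[local_key] = mem
        else:
            mem.set_data_handle(ptr)  # reuse cached object, new data pointer
            self._is_reusing = True
        return mem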
paddle/fluid/platform/mkldnn_utils.h  0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <mkldnn.h>
#include <string>

namespace paddle {
namespace platform {

inline mkldnn::memory::primitive_desc create_prim_desc_from_dims(
    const std::vector<int>& ltz, mkldnn::memory::format fmt,
    mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) {
  mkldnn_memory_desc_t mem_fmt;

  mem_fmt.primitive_kind = mkldnn_memory;
  mem_fmt.ndims = ltz.size();
  for (unsigned int i = 0; i < ltz.size(); ++i) {
    mem_fmt.dims[i] = ltz[i];  // logical dimensions (nchw format,
                               // regardless physical layout)
  }
  mem_fmt.data_type = static_cast<mkldnn_data_type_t>(data_type);
  mem_fmt.format = static_cast<mkldnn_memory_format_t>(fmt);

  unsigned int total_stride = 1;
  for (int i = ltz.size() - 1; i >= 0; --i) {
    mem_fmt.layout_desc.blocking.padding_dims[i] =
        ltz[i];  // logical dimensions (nchw format, regardless physical
                 // layout)
    mem_fmt.layout_desc.blocking.block_dims[i] = 1;
    mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
    mem_fmt.layout_desc.blocking.strides[0][i] = total_stride;
    mem_fmt.layout_desc.blocking.strides[1][i] = 1;
    total_stride *= ltz[i];
  }
  mem_fmt.layout_desc.blocking.offset_padding = 0;  // no initial offset

  auto& pool = platform::DeviceContextPool::Instance();
  auto place = paddle::platform::CPUPlace();
  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
  auto& cpu_engine = dev_ctx->GetEngine();
  return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine);
}

inline mkldnn::memory::primitive_desc create_prim_desc_from_format(
    const std::vector<int>& ltz, const mkldnn::memory::format format,
    const mkldnn::memory::data_type data_type) {
  auto md = mkldnn::memory::desc({ltz}, data_type, format);
  auto& pool = platform::DeviceContextPool::Instance();
  auto place = paddle::platform::CPUPlace();
  auto dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
  PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device");
  auto& cpu_engine = dev_ctx->GetEngine();
  return mkldnn::memory::primitive_desc(md, cpu_engine);
}

}  // namespace platform
}  // namespace paddle
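The first helper fills the blocking descriptor by hand; the stride loop is plain row-major (nchw) arithmetic, accumulating the product of trailing dimension sizes. A minimal Python sketch of that arithmetic (function name is illustrative):

def row_major_strides(dims):
    # Walk dimensions from last to first, exactly like the total_stride
    # loop in create_prim_desc_from_dims above.
    strides = [0] * len(dims)
    total_stride = 1
    for i in range(len(dims) - 1, -1, -1):
        strides[i] = total_stride
        total_stride *= dims[i]
    return strides

# For an NCHW tensor of shape [2, 3, 4, 5]:
print(row_major_strides([2, 3, 4, 5]))  # [60, 20, 5, 1]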
paddle/fluid/platform/profiler.cc
...
@@ -112,12 +112,11 @@ double Event::CpuElapsedMs(const Event& e) const {
 }

 double Event::CudaElapsedMs(const Event& e) const {
-#ifdef PADDLE_WITH_CUDA
 #ifdef PADDLE_WITH_CUPTI
   return gpu_ns_ / 1000000.0;
-#endif
 #else
-  PADDLE_THROW("CUDA is not enabled");
+  LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
+  return 0;
 #endif
 }
...
@@ -255,9 +254,11 @@ struct EventItem {
   std::string name;
   int calls;
   double total_time;
-  double min_time;
   double max_time;
   double ave_time;
+  double min_time;
+  double cpu_time;
+  double gpu_time;
   float ratio;
 };
...
@@ -291,8 +292,12 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
   // Output events table
   std::cout.setf(std::ios::left);
   std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
-            << "Calls" << std::setw(data_width) << "Total"
-            << std::setw(data_width) << "Min." << std::setw(data_width)
+            << "Calls" << std::setw(data_width) << "Total";
+  if (g_state == ProfilerState::kAll) {
+    std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
+              << std::setw(data_width * 2) << "GPU Time (Ratio)";
+  }
+  std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
             << "Max." << std::setw(data_width) << "Ave."
             << std::setw(data_width) << "Ratio." << std::endl;
   for (size_t i = 0; i < events_table.size(); ++i) {
...
@@ -300,8 +305,18 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
       const EventItem& event_item = events_table[i][j];
       std::cout << std::setw(name_width) << event_item.name
                 << std::setw(data_width) << event_item.calls
-                << std::setw(data_width) << event_item.total_time
-                << std::setw(data_width) << event_item.min_time
+                << std::setw(data_width) << event_item.total_time;
+      if (g_state == ProfilerState::kAll) {
+        std::cout << std::setw(data_width * 2)
+                  << string::Sprintf(
+                         "%f (%f)", event_item.cpu_time,
+                         (event_item.cpu_time / event_item.total_time))
+                  << std::setw(data_width * 2)
+                  << string::Sprintf(
+                         "%f (%f)", event_item.gpu_time,
+                         (event_item.gpu_time / event_item.total_time));
+      }
+      std::cout << std::setw(data_width) << event_item.min_time
                 << std::setw(data_width) << event_item.max_time
                 << std::setw(data_width) << event_item.ave_time
                 << std::setw(data_width) << event_item.ratio << std::endl;
...
@@ -350,6 +365,18 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
         return a.ave_time > b.ave_time;
       };
       break;
+    case EventSortingKey::kGPUTime:
+      sorted_domain = "average time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.gpu_time > b.gpu_time;
+      };
+      break;
+    case EventSortingKey::kCPUTime:
+      sorted_domain = "average time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.cpu_time > b.cpu_time;
+      };
+      break;
     default:
       sorted_domain = "event first end time";
   }
...
@@ -388,10 +415,17 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
       }

       if (rit != pushed_events.rend()) {
-        double event_time = (g_state == ProfilerState::kCUDA ||
-                             g_state == ProfilerState::kAll)
-                                ? rit->CudaElapsedMs((*analyze_events)[i][j])
-                                : rit->CpuElapsedMs((*analyze_events)[i][j]);
+        double event_time = 0;
+        double gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]);
+        double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]);
+        if (g_state == ProfilerState::kCUDA) {
+          event_time = gpu_time;
+        } else if (g_state == ProfilerState::kCPU) {
+          event_time = cpu_time;
+        } else {
+          event_time = gpu_time + cpu_time;
+        }
+
         total += event_time;

         std::string event_name;
...
@@ -408,7 +442,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
           event_idx[event_name] = event_items.size();
           EventItem event_item = {event_name, 1,          event_time,
                                   event_time, event_time, event_time,
-                                  0.};
+                                  gpu_time,   cpu_time,   0.};
           event_items.push_back(event_item);
         } else {
           int index = event_idx[event_name];
...
@@ -421,6 +455,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
           // max time
           event_items[index].max_time =
               std::max(event_time, event_items[index].max_time);
+          event_items[index].gpu_time += gpu_time;
+          event_items[index].cpu_time += cpu_time;
         }

         // remove the push marker from the list
...
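The ParseEvents change above now always computes both elapsed times and combines them according to the profiler state: CUDA-only profiling reports GPU time, CPU-only profiling reports CPU time, and the kAll state reports their sum. A minimal Python sketch of that selection logic (illustrative only):

def combine_event_time(state, gpu_time, cpu_time):
    # Mirrors the branch added in ParseEvents.
    if state == "CUDA":
        return gpu_time
    elif state == "CPU":
        return cpu_time
    return gpu_time + cpu_time

print(combine_event_time("All", gpu_time=1.5, cpu_time=2.0))  # 3.5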
paddle/fluid/platform/profiler.cu
...
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/platform/profiler.h"
 #include <cuda.h>
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace platform {
...
@@ -22,26 +21,27 @@ namespace platform {
 __global__ void DummyKernel(int* a) { a[0] = 0; }

 static void ForEachDevice(std::function<void(int)> func) {
-  auto original_device = GetCurrentDeviceId();
-  int count = GetCUDADeviceCount();
+  auto original_device = platform::GetCurrentDeviceId();
+  int count = platform::GetCUDADeviceCount();
   for (int i = 0; i < count; i++) {
-    SetDeviceId(i);
+    platform::SetDeviceId(i);
     func(i);
   }
-  SetDeviceId(original_device);
+  platform::SetDeviceId(original_device);
 }

 void DummyKernelAndEvent() {
   for (int i = 0; i < 5; i++) {
     ForEachDevice([](int d) {
-      CUDADeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
+      platform::SetDeviceId(d);
+      cudaStream_t stream;
+      PADDLE_ENFORCE(cudaStreamCreate(&stream));
       Mark("_cuda_startup_");
       int* ptr;
       PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int)));
-      DummyKernel<<<1, 1, 0, dev_ctx->stream()>>>(ptr);
-      dev_ctx->Wait();
+      DummyKernel<<<1, 1, 0, stream>>>(ptr);
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
       PADDLE_ENFORCE(cudaFree(ptr));
-      delete dev_ctx;
     });
   }
 }
...
paddle/fluid/platform/profiler.h
...
@@ -17,54 +17,13 @@ limitations under the License. */
 #include <list>
 #include <string>
 #include <vector>
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/event.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/gpu_info.h"
+#endif

 namespace paddle {
 namespace platform {

-enum EventType { kMark, kPushRange, kPopRange };
-
-class Event {
- public:
-  // The DeviceContext is used to get the cuda stream.
-  // If CPU profiling mode, can pass nullptr.
-  Event(EventType type, std::string name, uint32_t thread_id);
-
-  const EventType& type() const;
-  std::string name() const { return name_; }
-  uint32_t thread_id() const { return thread_id_; }
-
-#ifdef PADDLE_WITH_CUDA
-#ifndef PADDLE_WITH_CUPTI
-  cudaEvent_t event() const { return event_; }
-  int device() const { return device_; }
-#endif
-#endif
-
-  double CpuElapsedMs(const Event& e) const;
-  double CudaElapsedMs(const Event& e) const;
-
- private:
-  EventType type_;
-  std::string name_;
-  uint32_t thread_id_;
-  int64_t cpu_ns_;
-#ifdef PADDLE_WITH_CUDA
-#ifdef PADDLE_WITH_CUPTI
-  int64_t gpu_ns_ = 0;
-
- public:
-  void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
-    gpu_ns_ += end_ns - start_ns;
-  }
-
- private:
-#else
-  cudaEvent_t event_ = nullptr;
-  int device_ = -1;
-#endif
-#endif
-};
-
 enum ProfilerState {
   kDisabled,  // disabled state
...
@@ -117,7 +76,16 @@ struct RecordBlock {
 std::vector<std::vector<Event>> GetAllEvents();

 // Candidate keys to sort the profiling report
-enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
+enum EventSortingKey {
+  kDefault,
+  kCalls,
+  kTotal,
+  kMin,
+  kMax,
+  kAve,
+  kCPUTime,
+  kGPUTime
+};

 // Enable the profiling function.
 void EnableProfiler(ProfilerState state);
...
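On the Python side these profiler states and report columns are reachable through the fluid.profiler context manager. A hedged usage sketch, assuming the Python wrapper exposes the 'All' state and the usual sorted_key strings ('calls', 'total', 'max', 'min', 'ave'); the training loop is illustrative:

import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

place = fluid.CPUPlace()
exe = fluid.Executor(place)
# With state 'All', the printed report carries the "CPU Time (Ratio)"
# and "GPU Time (Ratio)" columns added above.
with profiler.profiler('All', 'total'):
    for _ in range(10):
        pass  # run exe.run(...) iterations here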
paddle/fluid/platform/profiler_test.cc
...
@@ -33,7 +33,6 @@ TEST(Event, CpuElapsedTime) {
 }

 TEST(RecordEvent, RecordEvent) {
-  using paddle::platform::DeviceContext;
   using paddle::platform::Event;
   using paddle::platform::EventType;
   using paddle::platform::RecordEvent;
...
paddle/fluid/platform/temporary_allocator_test.cc
...
@@ -141,7 +141,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx =
       static_cast<platform::CPUDeviceContext*>(pool.Get(cpu_place));
-  framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
+  framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);

   int numel = memory_size / sizeof(float);
   framework::Tensor tensor =
...
@@ -156,7 +156,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx =
       static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
-  framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
+  framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);

   int numel = memory_size / sizeof(float);
   framework::Tensor tensor =
       ctx.AllocateTmpTensor<float, platform::CUDADeviceContext>(
...
@@ -179,7 +179,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx =
       static_cast<platform::CPUDeviceContext*>(pool.Get(cpu_place));
-  framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
+  framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);

   int numel = memory_size / sizeof(float);
   framework::Tensor out_side_tensor;
...
@@ -200,7 +200,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx =
       static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
-  framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
+  framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);

   size_t memory_size = 500;
   int numel = memory_size / sizeof(float);
...
paddle/fluid/pybind/imperative.cc
...
@@ -34,7 +34,7 @@ void BindTracer(pybind11::module* m) {
              framework::BlockDesc* block,
              const platform::CPUPlace expected_place,
              const bool stop_gradient = false) {
-            self.Trace(op, inputs, outputs, block, expected_place,
-                       stop_gradient);
+            return self.Trace(op, inputs, outputs, block, expected_place,
+                              stop_gradient);
           })
       .def("trace",
...
@@ -44,7 +44,7 @@ void BindTracer(pybind11::module* m) {
              framework::BlockDesc* block,
              const platform::CUDAPlace expected_place,
              const bool stop_gradient = false) {
-            self.Trace(op, inputs, outputs, block, expected_place,
-                       stop_gradient);
+            return self.Trace(op, inputs, outputs, block, expected_place,
+                              stop_gradient);
           })
       .def("py_trace", &imperative::Tracer::PyTrace,
...
paddle/fluid/pybind/ir.cc
...
@@ -101,7 +101,8 @@ void BindGraph(py::module *m) {
            [](Graph &self, Node &node) { return self.RemoveNode(&node); })
       .def("retrieve_node", &Graph::RetrieveNode,
            return_value_policy::reference)
-      .def("resolve_hazard", &Graph::ResolveHazard);
+      .def("resolve_hazard", &Graph::ResolveHazard)
+      .def("origin_program_desc", &Graph::OriginProgram);
 }

 void BindNode(py::module *m) {
...
paddle/fluid/pybind/protobuf.cc
...
@@ -189,6 +189,8 @@ void BindBlockDesc(pybind11::module *m) {
              return self.HasVar(name);
            },
            pybind11::return_value_policy::reference)
+      .def("_clear_block", [](pd::BlockDesc &self) { return self.Clear(); },
+           pybind11::return_value_policy::reference)
       .def("_rename_var",
            [](pd::BlockDesc &self, const pybind11::bytes &byte_name,
               const pybind11::bytes &byte_name_new) {
...
paddle/fluid/pybind/pybind.cc
...
@@ -976,6 +976,7 @@ All parameter, weight, gradient are variables in Paddle.
        [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });

   // -- python binds for parallel executor.
   py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
   py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
     ExecutionStrategy allows the user to more preciously control how to run
...
@@ -1227,9 +1228,9 @@ All parameter, weight, gradient are variables in Paddle.
           cannot be updated after being finalized.)DOC");

   pe.def(py::init<const std::vector<platform::Place> &,
-                  const std::unordered_set<std::string> &, const ProgramDesc &,
+                  const std::unordered_set<std::string> &,
                   const std::string &, Scope *, std::vector<Scope *> &,
-                  const ExecutionStrategy &, const BuildStrategy &>())
+                  const ExecutionStrategy &, const BuildStrategy &,
+                  ir::Graph *>())
       // NOTE: even we return a vec<Scope*>* to Python use reference policy.
       // We still cannot get local_scope from this vector, since the element
       // of vec<Scope*> will be freed by Python GC. We can only return Scope*
...
paddle/fluid/train/demo/demo_trainer.cc
...
@@ -73,7 +73,7 @@ int main() {
   PADDLE_ENFORCE_NE(loss_name, "", "loss not found");

   // init all parameters
-  executor.Run(*startup_program.get(), &scope, 0);
+  executor.Run(*startup_program, &scope, 0);

   // prepare data
   auto x_var = scope.Var("x");
...
@@ -101,7 +101,7 @@ int main() {
   clock_t t1 = clock();
   for (int i = 0; i < 10; ++i) {
-    executor.Run(*train_program.get(), &scope, 0, false, true);
+    executor.Run(*train_program, &scope, 0, false, true);
     std::cout << "step: " << i << " loss: "
               << loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]
               << std::endl;
...
paddle/fluid/train/test_train_recognize_digits.cc
...
@@ -74,7 +74,7 @@ void Train() {
   float first_loss = 0.0;
   float last_loss = 0.0;
   for (int i = 0; i < 100; ++i) {
-    executor.Run(*train_program.get(), &scope, 0, false, true);
+    executor.Run(*train_program, &scope, 0, false, true);
     if (i == 0) {
       first_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
     } else if (i == 99) {
...
paddle/scripts/paddle_build.sh
...
@@ -444,6 +444,7 @@ function assert_api_spec_approvals() {
       "paddle/fluid/framework/ir/node.h"
       "paddle/fluid/framework/ir/graph.h"
       "paddle/fluid/framework/framework.proto"
+      "python/paddle/fluid/compiler.py"
       "paddle/fluid/operators/distributed/send_recv.proto.in"
       )
   for API_FILE in ${API_FILES[*]}; do
       API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true`
...
python/paddle/fluid/compiler.py
...
@@ -17,8 +17,10 @@ import os
 import six
 import sys
 from .. import compat as cpt
-from . import framework
 from . import core
+from . import framework

 __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
...
@@ -36,7 +38,7 @@ def _place_obj(place):

 class CompiledProgram(object):
     """
-    Compiles a Program for execution.
+    Compiles to Graph for execution.

     1. Users first create the program with layers.
     2. Optionally, users use CompiledProgram to optimize the program before run.
...
@@ -51,7 +53,7 @@ class CompiledProgram(object):
     Example:
         .. code-block:: python

-          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+          place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
           exe = fluid.Executor(place)
           exe.run(startup)
           compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
...
@@ -62,11 +64,25 @@ class CompiledProgram(object):
                 fetch_list=[loss.name])

     Args:
-        program: Program instance that contains the model logic.
+        program_or_graph (Graph|Program): If it's Program, it will be first
+            lowered to a graph for further optimizations. If it's a graph
+            (potentially optimized before), it will be directly used for
+            further optimizations. Note: graph is only supported when compiled
+            with with_data_parallel option.
     """

-    def __init__(self, program):
-        self._program = program
+    def __init__(self, program_or_graph):
+        if isinstance(program_or_graph, core.Graph):
+            self._graph = program_or_graph
+            self._program = None
+        elif isinstance(program_or_graph, framework.Program):
+            self._graph = core.Graph(program_or_graph.desc)
+            self._program = program_or_graph
+        else:
+            raise ValueError("Wrong program_to_graph type: %s" %
+                             type(program_or_graph))
+
+        self._program_desc = self._graph.origin_program_desc()
         self._scope = None
         self._place = None
         self._executor = None
...
@@ -101,6 +117,7 @@ class CompiledProgram(object):
             self
         """
         assert not self._is_data_parallel, "Already compiled with parallel."
+        assert not self._is_inference, "Cannot compile both data parallel and inference"
         self._is_data_parallel = True
         self._build_strategy = build_strategy
         self._exec_strategy = exec_strategy
...
@@ -110,6 +127,8 @@ class CompiledProgram(object):
             self._exec_strategy = ExecutionStrategy()
         if self._build_strategy is None:
             self._build_strategy = BuildStrategy()
+        self._build_strategy.is_distribution = framework.is_pserver_mode(
+            self._program)
         return self

     def with_inference_optimize(self, config):
...
@@ -120,11 +139,13 @@ class CompiledProgram(object):
         Returns:
             self
         """
+        assert not self._is_data_parallel, "Cannot compile both data parallel and inference"
+        assert not self._is_inference, "Already compiled with inference"
+
         assert any([
             isinstance(config, InferNativeConfig),
             isinstance(config, InferAnalysisConfig)
         ])
-        self._is_data_parallel = False
         self._is_inference = True
         self._infer_config = config
         return self
...
@@ -173,37 +194,41 @@ class CompiledProgram(object):
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
             self._exec_strategy.num_threads = cpu_num * 2

-        trainers_endpoints = self._program._trainers_endpoints
         # FIXME(dzhwinter): enable_inplace should be after memory_optimize
         # if turn on python memory optimize, turn off the inplace_pass.
         if self._build_strategy.memory_optimize is None:
-            self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True
+            self._build_strategy.memory_optimize = False if self._program and self._program._is_mem_optimized else True
         if self._build_strategy.enable_inplace is None:
-            self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
+            self._build_strategy.enable_inplace = False if self._program and self._program._is_mem_optimized else True

-        if self._build_strategy.num_trainers > 1 and trainers_endpoints:
+        # TODO(wuyi): trainer endpoints should be passed in through
+        # build_strategy, not program.xxx.
+        if self._program and self._build_strategy.num_trainers > 1 and \
+                self._program._trainers_endpoints:
+            tps = self._program._trainers_endpoints
             assert self._build_strategy.num_trainers == len(
-                trainers_endpoints), "num_trainers == len(end_points)"
-            self._build_strategy.trainers_endpoints = trainers_endpoints
+                tps), "num_trainers == len(end_points)"
+            self._build_strategy.trainers_endpoints = tps

-        self._persistable_vars = set([
-            cpt.to_text(v.name)
-            for v in [
-                var for var in self._program.list_vars()
-                if var.persistable and var.type != core.VarDesc.VarType.RAW
-            ]
-        ])
+        self._persistable_vars = []
+        for block_id in range(self._program_desc.num_blocks()):
+            bdesc = self._program_desc.block(block_id)
+            self._persistable_vars.extend([
+                cpt.to_text(v.name()) for v in bdesc.all_vars()
+                if v.persistable() and v.type() != core.VarDesc.VarType.RAW
+            ])

         places = list(map(_place_obj, self._places))
         return core.ParallelExecutor(
-            places, self._persistable_vars, self._program.desc,
+            places, set(self._persistable_vars),
             cpt.to_text(self._loss_name)
             if self._loss_name else six.u(''), self._scope, self._local_scopes,
-            self._exec_strategy, self._build_strategy)
+            self._exec_strategy, self._build_strategy, self._graph)

     def _compile_inference(self):
-        assert self._is_data_parallel is False
         return core.create_paddle_predictor(self._infer_config)

     def _compile(self, scope, place):
...
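With this change CompiledProgram accepts either a Program or an already-built Graph. A hedged sketch of the two entry points (model construction and loss_name are illustrative; graph input is only supported together with with_data_parallel):

import paddle.fluid as fluid
from paddle.fluid import compiler, core

main = fluid.Program()
# ... build the model inside `main` ...

# Path 1: compile straight from a Program (lowered to a Graph internally).
compiled = compiler.CompiledProgram(main).with_data_parallel(
    loss_name='loss')

# Path 2: compile from a pre-built (possibly pass-optimized) Graph.
graph = core.Graph(main.desc)
compiled_from_graph = compiler.CompiledProgram(graph).with_data_parallel(
    loss_name='loss')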
python/paddle/fluid/executor.py
...
@@ -538,6 +538,8 @@ class Executor(object):
         else:
             # TODO(panyx0718): Can compile program to optimize executor
             # performance.
+            # TODO(panyx0718): executor should be able to run graph.
+            assert program._program, "CompiledProgram is compiled from graph, can only run with_data_parallel."
             return self._run(
                 program._program,
                 self._default_executor,
...
python/paddle/fluid/framework.py
...
@@ -87,6 +87,15 @@ def _current_expected_place():
     return _imperative_current_expected_place_


+def is_pserver_mode(main_program):
+    main = main_program if main_program \
+        else default_main_program()
+    for op in main.global_block().ops:
+        if op.type in ["send", "recv"]:
+            return True
+    return False
+
+
 class NameScope(object):
     def __init__(self, name="", parent=None):
         self._children = dict()
...
@@ -378,16 +387,19 @@ class Variable(object):
                 # get_capacity is implemented
                 pass

-        self.block.vars[name] = self
-        self.op = None
-        self.stop_gradient = stop_gradient
-        self.is_data = is_data
         if _in_imperative_mode():
+            # record vars in tracer rather than blocks
             self._ivar = kwargs.get("ivar", None)
             if not self._ivar:
-                self._ivar = core.VarBase()
+                self._ivar = core.VarBase(stop_gradient)
             self._ivar.desc = self.desc
-            self._ivar.stop_gradient = stop_gradient
+            if persistable:
+                self.block.vars[name] = self
+        else:
+            self.block.vars[name] = self
+        self.op = None
+        self.stop_gradient = stop_gradient
+        self.is_data = is_data

     def _numpy(self):
         new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
...
@@ -723,7 +735,6 @@ class Operator(object):
                 self._update_desc_attr(attr_name, attr_val)

             self.desc.check_attrs()
             if self._has_kernel(type):
                 self.desc.infer_var_type(self.block.desc)
                 self.desc.infer_shape(self.block.desc)
...
@@ -731,6 +742,7 @@ class Operator(object):
         if _in_imperative_mode():
             self.iop = core.OpBase()
             self.iop.desc = self.desc
             self.inputs = defaultdict(list)
             if inputs is not None:
                 for k, v in six.iteritems(inputs):
...
@@ -738,6 +750,7 @@ class Operator(object):
                         self.inputs[k].append(v._ivar)
                     elif isinstance(v, list) or isinstance(v, tuple):
                         self.inputs[k].extend([var._ivar for var in v])
             self.outputs = defaultdict(list)
             if outputs is not None:
                 for k, v in six.iteritems(outputs):
...
@@ -1187,6 +1200,15 @@ class Block(object):
         else:
             raise ValueError("Var {0} is not found recursively".format(name))

+    def _clear_block(self):
+        # TODO(minqiyang): move this to backward_hooks
+        self.desc._clear_block()
+
+        for name in self.vars.keys():
+            assert self.vars[name].persistable
+
+        del self.ops[:]
+
     def all_parameters(self):
         return list(self.iter_parameters())
...
@@ -1317,18 +1339,31 @@ class Block(object):
             inputs=kwargs.get("inputs", None),
             outputs=kwargs.get("outputs", None),
             attrs=kwargs.get("attrs", None))
-        self.ops.append(op)

-        # TODO(minqiyang): add stop_gradient support in static mode too.
-        # currently, we only support stop_gradient in imperative mode.
-        self._trace_op(op, kwargs.get("stop_gradient", False))
+        if _in_imperative_mode():
+            # record ops in tracer rather than blocks
+            #
+            # TODO(minqiyang): add op stop_gradient support in static mode too.
+            # currently, we only support stop_gradient in imperative mode.
+            self._trace_op(op, kwargs.get("stop_gradient", False))
+        self.ops.append(op)
         return op

     def _trace_op(self, op, stop_gradient=False):
-        if _in_imperative_mode():
-            _imperative_tracer().trace(
-                op.iop, op.inputs, op.outputs, self.desc,
-                _imperative_current_expected_place_, stop_gradient)
+        backward_refs = _imperative_tracer().trace(
+            op.iop, op.inputs, op.outputs, self.desc,
+            _imperative_current_expected_place_, stop_gradient)
+
+        # TODO(minqiyang): support backward_hooks to eager remove backward_refs
+        op.backward_refs = defaultdict(list)
+        for k, v in six.iteritems(op.inputs):
+            if k in backward_refs:
+                op.backward_refs[k] = op.inputs[k]
+
+        for k, v in six.iteritems(op.outputs):
+            if k in backward_refs:
+                op.backward_refs[k] = op.outputs[k]

     def _insert_op(self, index, *args, **kwargs):
         """
...
@@ -1383,6 +1418,7 @@ class Block(object):
             outputs=kwargs.get("outputs", None),
             attrs=kwargs.get("attrs", None))
         self.ops.insert(0, op)
-        self._trace_op(op, kwargs.get("stop_gradient", False))
+        if _in_imperative_mode():
+            self._trace_op(op, kwargs.get("stop_gradient", False))
         return op
...
python/paddle/fluid/imperative/layers.py
...
@@ -17,7 +17,7 @@ import contextlib
 import sys
 import numpy as np
 import collections
+from .. import unique_name
 from paddle.fluid import core
 from paddle.fluid import framework
 from paddle.fluid.imperative import base
...
@@ -26,14 +26,33 @@ __all__ = ['Layer', 'PyLayer']

 class Layer(core.Layer):
-    """Layers composed of operators."""
+    """Layers composed of operators.
+
+    Args:
+        name_scope: prefix name used by the layer to name parameters.
+            If prefix is "my_model/layer_1", parameter name in MyLayer
+            can be "my_model/layer_1/MyLayer/w_n", where w is the parameter
+            base name and n is a unique suffix auto-generated.
+        dtype: data type for the variables in the layer.
+    """

-    def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
+    def __init__(self, name_scope, dtype=core.VarDesc.VarType.FP32):
+        self._full_name = unique_name.generate(name_scope + "/" +
+                                               self.__class__.__name__)
         self._built = False
         self._dtype = dtype
         self._parameters = collections.OrderedDict()
         self._sub_layers = collections.OrderedDict()

+    def full_name(self):
+        """Full name for this layer.
+
+        Full name is composed by name_scope + "/" + MyLayer.__class__.__name__
+
+        Returns the full name of this layer.
+        """
+        return self._full_name
+
     def parameters(self, include_sublayers=True):
         """Returns a list of Parameters from current and sub-layers.
...
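Layer now takes a required name_scope and derives a unique full_name() from it. A hedged sketch of a custom layer under the new constructor (MyLayer and its internals are illustrative, not from this commit):

import paddle.fluid as fluid
from paddle.fluid.imperative import Layer, FC

class MyLayer(Layer):
    def __init__(self, name_scope):
        # Pass the scope up; parameters then get names prefixed with
        # "my_model/MyLayer/..." plus an auto-generated suffix.
        super(MyLayer, self).__init__(name_scope)
        self._fc = FC(self.full_name(), size=10)

    def forward(self, inputs):
        return self._fc(inputs)

with fluid.imperative.guard():
    layer = MyLayer("my_model")
    print(layer.full_name())  # e.g. "my_model/MyLayer_0"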
python/paddle/fluid/imperative/nn.py
...
@@ -27,6 +27,7 @@ __all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']

 class Conv2D(layers.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  num_filters,
                  filter_size,
...
@@ -38,19 +39,17 @@ class Conv2D(layers.Layer):
                  act=None,
                  param_attr=None,
                  bias_attr=None,
-                 name=None,
                  dtype=core.VarDesc.VarType.FP32):
         assert param_attr is not False, "param_attr should not be False here."
-        super(Conv2D, self).__init__(name=name, dtype=dtype)
+        super(Conv2D, self).__init__(name_scope, dtype=dtype)

         # TODO(minqiyang): Move this to the top.
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            type(self).__name__,
+            self.full_name(),
             param_attr=param_attr,
             bias_attr=bias_attr,
             dtype=dtype,
-            name=name,
             act=act)

         self._groups = groups
...
@@ -143,6 +142,7 @@ class Conv2D(layers.Layer):

 class Pool2D(layers.Layer):
     def __init__(self,
+                 name_scope,
                  pool_size=-1,
                  pool_type="max",
                  pool_stride=1,
...
@@ -151,7 +151,6 @@ class Pool2D(layers.Layer):
                  use_cudnn=True,
                  ceil_mode=False,
                  exclusive=True,
-                 name=None,
                  dtype=core.VarDesc.VarType.FP32):
         if pool_type not in ["max", "avg"]:
             raise ValueError(
...
@@ -166,10 +165,10 @@ class Pool2D(layers.Layer):
         if not isinstance(use_cudnn, bool):
             raise ValueError("use_cudnn should be True or False")

-        super(Pool2D, self).__init__(name=name, dtype=dtype)
+        super(Pool2D, self).__init__(name_scope, dtype=dtype)

         from ..layer_helper import LayerHelper
-        self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name)
+        self._helper = LayerHelper(self.full_name(), dtype=dtype)

         self._pool_type = pool_type
         self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
...
@@ -205,25 +204,24 @@ class Pool2D(layers.Layer):

 class FC(layers.Layer):
     def __init__(self,
+                 name_scope,
                  size,
                  param_attr=None,
                  bias_attr=None,
                  num_flatten_dims=1,
                  dtype=core.VarDesc.VarType.FP32,
-                 act=None,
-                 name=None):
-        super(FC, self).__init__()
+                 act=None):
+        super(FC, self).__init__(name_scope)

         self._size = size
         self._num_flatten_dims = num_flatten_dims
         self._dtype = dtype
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            'FC',
+            self.full_name(),
             param_attr=param_attr,
             bias_attr=bias_attr,
-            act=act,
-            name=name)
+            act=act)

     def _build_once(self, input):
         input_shape = input.shape
...
@@ -282,6 +280,7 @@ class FC(layers.Layer):

 class BatchNorm(layers.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  act=None,
                  is_test=False,
...
@@ -292,22 +291,20 @@ class BatchNorm(layers.Layer):
                  dtype=core.VarDesc.VarType.FP32,
                  data_layout='NCHW',
                  in_place=False,
-                 name=None,
                  moving_mean_name=None,
                  moving_variance_name=None,
                  do_model_average_for_mean_and_var=False,
                  fuse_with_relu=False,
                  use_global_stats=False):
-        super(BatchNorm, self).__init__()
+        super(BatchNorm, self).__init__(name_scope)

         assert bias_attr is not False, "bias_attr should not be False in batch_norm."

         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            'batch_norm',
+            self.full_name(),
             param_attr=param_attr,
             bias_attr=bias_attr,
-            name=name,
             act=act)

         if dtype == core.VarDesc.VarType.FP16:
...
@@ -419,6 +416,7 @@ class Embedding(layers.Layer):
         constructor.

     Args:
+        name_scope: See base class.
         size(tuple|list): The shape of the look up table parameter. It should
             have two elements which indicate the size of the dictionary of
             embeddings and the size of each embedding vector respectively.
...
@@ -446,6 +444,7 @@ class Embedding(layers.Layer):
     """

     def __init__(self,
+                 name_scope,
                  size,
                  is_sparse=False,
                  is_distributed=False,
...
@@ -453,7 +452,7 @@ class Embedding(layers.Layer):
                  param_attr=None,
                  dtype='float32'):
-        super(Embedding, self).__init__()
+        super(Embedding, self).__init__(name_scope)
         self._size = size
         self._is_sparse = is_sparse
         self._is_distributed = is_distributed
...
@@ -468,7 +467,7 @@ class Embedding(layers.Layer):
         assert self._is_sparse is True and self._is_distributed is False

         from ..layer_helper import LayerHelper
-        self._helper = LayerHelper('embedding', param_attr=param_attr)
+        self._helper = LayerHelper(self.full_name(), param_attr=param_attr)
         self._w = self._helper.create_parameter(
             attr=self._param_attr,
             shape=self._size,
...
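Every built-in imperative layer now takes name_scope as its first positional argument. A hedged usage sketch under the new signatures (shapes and scope strings are illustrative):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC

with fluid.imperative.guard():
    # Each layer names its parameters under the given scope prefix.
    conv = Conv2D("mnist", num_channels=1, num_filters=20, filter_size=5)
    pool = Pool2D("mnist", pool_size=2, pool_stride=2)
    fc = FC("mnist", size=10)

    x = fluid.imperative.to_variable(
        np.random.rand(8, 1, 28, 28).astype('float32'))
    y = fc(pool(conv(x)))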
python/paddle/fluid/layer_helper.py
...
@@ -34,6 +34,9 @@ class LayerHelper(object):
         self.kwargs = kwargs
         self.layer_type = layer_type
         name = self.kwargs.get('name', None)
+        # TODO(panyx0718, minqiyang): imperative mode
+        # can not use both `layer_type` and `name`. Deprecate LayerHelper
+        # and write a Helper for imperative mode.
         if name is None:
             self.kwargs['name'] = unique_name.generate(self.layer_type)
...
python/paddle/fluid/layers/detection.py
...
@@ -551,9 +551,10 @@ def yolov3_loss(x,
         gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
         gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
         anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
-        anchors = [0, 1, 2]
-        loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors,
-                                        ignore_thresh=0.5, downsample_ratio=32)
+        anchor_mask = [0, 1, 2]
+        loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
+                                        anchor_mask=anchor_mask, class_num=80,
+                                        ignore_thresh=0.7, downsample_ratio=32)
     """
     helper = LayerHelper('yolov3_loss', **locals())
...
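As a reading aid for the corrected example: anchor_mask indexes into the flattened (w, h) anchor pairs, an inference from the example above rather than a full spec of the op:

    anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
    anchor_mask = [0, 1, 2]
    selected = [(anchors[2 * i], anchors[2 * i + 1]) for i in anchor_mask]
    print(selected)  # [(10, 13), (16, 30), (33, 23)]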
python/paddle/fluid/layers/nn.py    View file @ cf0511f2
...
@@ -87,6 +87,7 @@ __all__ = [
     'transpose',
     'im2sequence',
     'nce',
+    'sampled_softmax_with_cross_entropy',
     'hsigmoid',
     'beam_search',
     'row_conv',
...
@@ -668,7 +669,11 @@ def dynamic_lstmp(input,
                   candidate_activation='tanh',
                   proj_activation='tanh',
                   dtype='float32',
-                  name=None):
+                  name=None,
+                  h_0=None,
+                  c_0=None,
+                  cell_clip=None,
+                  proj_clip=None):
     """
     **Dynamic LSTMP Layer**
...
@@ -785,6 +790,17 @@ def dynamic_lstmp(input,
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the projection size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
+        cell_clip(float): If provided the cell state is clipped
+                          by this value prior to the cell output activation.
+        proj_clip(float): If `num_proj > 0` and `proj_clip` is
+                          provided, then the projected values are clipped elementwise to within
+                          `[-proj_clip, proj_clip]`.

     Returns:
         tuple: A tuple of two output variable: the projection of hidden state, \
...
@@ -831,25 +847,41 @@ def dynamic_lstmp(input,
     batch_hidden = helper.create_variable_for_type_inference(dtype)
     batch_gate = helper.create_variable_for_type_inference(dtype)
     batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)

-    helper.append_op(
-        type='lstmp',
-        inputs={
-            'Input': input,
-            'Weight': weight,
-            'ProjWeight': proj_weight,
-            'Bias': bias
-        },
+    inputs = {
+        'Input': input,
+        'Weight': weight,
+        'ProjWeight': proj_weight,
+        'Bias': bias
+    }
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, proj_size), \
+            'The shape of h0 should be (batch_size, %d)' % proj_size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
+
+    if cell_clip:
+        assert cell_clip >= 0, "cell_clip should not be negative."
+    if proj_clip:
+        assert proj_clip >= 0, "proj_clip should not be negative."
+
+    helper.append_op(
+        type='lstmp',
+        inputs=inputs,
         outputs={
             'Projection': projection,
             'Cell': cell,
-            'OrderedP0': ordered_proj0,
             'BatchHidden': batch_hidden,
             'BatchGate': batch_gate,
             'BatchCellPreAct': batch_cell_pre_act
         },
         attrs={
             'use_peepholes': use_peepholes,
+            'cell_clip': cell_clip,
+            'proj_clip': proj_clip,
             'is_reverse': is_reverse,
             'gate_activation': gate_activation,
             'cell_activation': cell_activation,
...
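A minimal sketch of exercising the new clipping arguments (all dimensions hypothetical); per the docstring above, size must be four times the hidden width and the projection output has proj_size columns:

    import paddle.fluid as fluid

    hidden_dim, proj_dim = 512, 256
    seq = fluid.layers.data(name='seq', shape=[128], dtype='float32', lod_level=1)
    fc_out = fluid.layers.fc(input=seq, size=hidden_dim * 4)
    # Clip the cell state and the projected output to [-3, 3].
    proj, cell = fluid.layers.dynamic_lstmp(
        input=fc_out, size=hidden_dim * 4, proj_size=proj_dim,
        cell_clip=3.0, proj_clip=3.0)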
@@ -2441,7 +2473,7 @@ def pool2d(input,
           data = fluid.layers.data(
               name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d = fluid.layers.pool2d(
+          pool2d = fluid.layers.pool2d(
               input=data,
               pool_size=2,
               pool_type='max',
...
@@ -2490,6 +2522,7 @@ def pool2d(input,
     return pool_out


+@templatedoc()
 def pool3d(input,
            pool_size=-1,
            pool_type="max",
...
@@ -2501,13 +2534,19 @@ def pool3d(input,
            name=None,
            exclusive=True):
     """
-    This function adds the operator for pooling in 3-dimensions, using the
-    pooling configurations mentioned in input parameters.
+    ${comment}

     Args:
-        input (Variable): ${input_comment}
-        pool_size (int): ${ksize_comment}
-        pool_type (str): ${pooling_type_comment}
+        input (Variable): The input tensor of pooling operator. The format of
+                          input tensor is NCDHW, where N is batch size, C is
+                          the number of channels, D is the depth of the feature,
+                          H is the height of the feature, and W is the width
+                          of the feature.
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        pool_type (string): ${pooling_type_comment}
         pool_stride (int): stride of the pooling layer.
         pool_padding (int): padding size.
         global_pooling (bool): ${global_pooling_comment}
...
@@ -2520,6 +2559,19 @@ def pool3d(input,
     Returns:
         Variable: output of pool3d layer.

+    Examples:
+
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32, 32], dtype='float32')
+          pool3d = fluid.layers.pool3d(
+              input=data,
+              pool_size=2,
+              pool_type='max',
+              pool_stride=1,
+              global_pooling=False)
     """
     if pool_type not in ["max", "avg"]:
         raise ValueError(
...
@@ -2569,7 +2621,27 @@ def adaptive_pool2d(input,
                     require_index=False,
                     name=None):
     """
-    ${comment}
+    **Adaptive Pool2d Operator**
+    The adaptive_pool2d operation calculates the output based on the input, pool_size,
+    pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
+    size, C is the number of channels, H is the height of the feature, and W is
+    the width of the feature. Parameters(pool_size) should contain two elements which
+    represent height and width, respectively. Also the H and W dimensions of output(Out)
+    is same as Parameter(pool_size).
+
+    For average adaptive pool2d:
+
+    ..  math::
+
+       hstart &= floor(i * H_{in} / H_{out})
+
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+
+       wstart &= floor(j * W_{in} / W_{out})
+
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+
+       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}

     Args:
         input (Variable): The input tensor of pooling operator. The format of
...
@@ -2579,8 +2651,8 @@ def adaptive_pool2d(input,
         pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
                                     it must contain two integers, (pool_size_Height, pool_size_Width).
         pool_type: ${pooling_type_comment}
-        require_index (bool): If true, the index of max pooling point along with outputs.
-            it cannot be set in average pooling type.
+        require_index (bool): If true, the index of max pooling point will be returned along
+            with outputs. It cannot be set in average pooling type.
         name (str|None): A name for this layer(optional). If set None, the
             layer will be named automatically.
...
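The floor/ceil bin arithmetic in the new docstring is easy to sanity-check in plain Python; for example, adapting H_in = 5 down to H_out = 3 yields overlapping bins:

    import math

    H_in, H_out = 5, 3
    bins = [(math.floor(i * H_in / H_out), math.ceil((i + 1) * H_in / H_out))
            for i in range(H_out)]
    print(bins)  # [(0, 2), (1, 4), (3, 5)] -- each output cell averages its bin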
@@ -2661,18 +2733,42 @@ def adaptive_pool3d(input,
                     require_index=False,
                     name=None):
     """
-    ${comment}
+    **Adaptive Pool3d Operator**
+    The adaptive_pool3d operation calculates the output based on the input, pool_size,
+    pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
+    size, C is the number of channels, D is the depth of the feature, H is the height of
+    the feature, and W is the width of the feature. Parameters(pool_size) should contain
+    three elements which represent depth, height and width, respectively. Also the D, H
+    and W dimensions of output(Out) is same as Parameter(pool_size).
+
+    For average adaptive pool3d:
+
+    ..  math::
+
+      dstart &= floor(i * D_{in} / D_{out})
+
+      dend &= ceil((i + 1) * D_{in} / D_{out})
+
+      hstart &= floor(j * H_{in} / H_{out})
+
+      hend &= ceil((j + 1) * H_{in} / H_{out})
+
+      wstart &= floor(k * W_{in} / W_{out})
+
+      wend &= ceil((k + 1) * W_{in} / W_{out})
+
+      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}

     Args:
-        input (Variable): The input tensor of pooling operator. The format of
-                          input tensor is NCHW, where N is batch size, C is
-                          the number of channels, H is the height of the
-                          feature, and W is the width of the feature.
+        input (Variable): The input tensor of pooling operator. The format of
+                          input tensor is NCDHW, where N is batch size, C is
+                          the number of channels, D is the depth of the feature,
+                          H is the height of the feature, and W is the width of the feature.
         pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-                                    it must contain two integers, (Depth, Height, Width).
+                                    it must contain three integers, (Depth, Height, Width).
         pool_type: ${pooling_type_comment}
-        require_index (bool): If true, the index of max pooling point along with outputs.
-            it cannot be set in average pooling type.
+        require_index (bool): If true, the index of max pooling point will be returned along
+            with outputs. It cannot be set in average pooling type.
         name (str|None): A name for this layer(optional). If set None, the
             layer will be named automatically.
...
@@ -2709,7 +2805,7 @@ def adaptive_pool3d(input,
               name='data', shape=[3, 32, 32], dtype='float32')
           pool_out, mask = fluid.layers.adaptive_pool3d(
               input=data,
-              pool_size=[3, 3],
+              pool_size=[3, 3, 3],
               pool_type='avg')
     """
     if pool_type not in ["max", "avg"]:
...
@@ -5765,6 +5861,132 @@ def softmax_with_cross_entropy(logits,
     return loss


+def sampled_softmax_with_cross_entropy(logits,
+                                       label,
+                                       num_samples,
+                                       num_true=1,
+                                       remove_accidental_hits=True,
+                                       use_customized_samples=False,
+                                       customized_samples=None,
+                                       customized_probabilities=None,
+                                       seed=0):
+    """
+    **Sampled Softmax With Cross Entropy Operator.**
+
+    Cross entropy loss with sampled softmax is used as the output layer for
+    larger output classes extensively. This operator samples a number of samples
+    for all examples, and computes the softmax normalized values for each
+    row of the sampled tensor, after which cross-entropy loss is computed.
+
+    Because this operator performs a softmax on logits internally, it expects
+    unscaled logits. This operator should not be used with the output of
+    softmax operator since that would produce incorrect results.
+
+    For examples with T true labels (T >= 1), we assume that each true label has
+    a probability of 1/T. For each sample, S samples are generated using a
+    log uniform distribution. True labels are concatenated with these samples to
+    form T + S samples for each example. So, assume the shape of logits is
+    [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a
+    probability is calculated, which corresponds to the Q(y|x) in
+    [Jean et al., 2014](http://arxiv.org/abs/1412.2007).
+
+    Logits are sampled according to the sampled labels. Then if
+    remove_accidental_hits is True, if a sample[i, j] accidentally hits true
+    labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to
+    make its softmax result close to zero. Then sampled logits are subtracted by
+    logQ(y|x), these sampled logits and re-indexed labels are used to compute
+    a softmax with cross entropy.
+
+    Args:
+        logits (Variable): The unscaled log probabilities, which is a 2-D tensor
+            with shape [N x K]. N is the batch_size, and K is the class number.
+        label (Variable): The ground truth which is a 2-D tensor. Label is a
+            Tensor<int64> with shape [N x T], where T is the number of true
+            labels per example.
+        num_samples (int): The number for each example, num_samples should be
+            less than the number of class.
+        num_true(int): The number of target classes per training example.
+        remove_accidental_hits (bool): A flag indicating whether to remove
+            accidental hits when sampling. If True and if a sample[i, j]
+            accidentally hits true labels, then the corresponding
+            sampled_logits[i, j] is minus by 1e20 to make its softmax result
+            close to zero. Default is True.
+        use_customized_samples (bool): Whether to use custom samples and probabilities to sample
+            logits.
+        customized_samples (Variable): User defined samples, which is a 2-D tensor
+            with shape [N, T + S]. S is the num_samples, and T is the number of true
+            labels per example.
+        customized_probabilities (Variable): User defined probabilities of samples,
+            a 2-D tensor which has the same shape with customized_samples.
+        seed (int): The random seed for generating random number, which is used
+            in the process of sampling. Default is 0.
+
+    Returns:
+        Variable: Return the cross entropy loss which is a 2-D tensor with shape
+            [N x 1].
+
+    Examples:
+        .. code-block:: python
+
+            logits = fluid.layers.data(name='data', shape=[256], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[5], dtype='int64')
+            fc = fluid.layers.fc(input=logits, size=100)
+            out = fluid.layers.sampled_softmax_with_cross_entropy(
+                logits=fc, label=label, num_samples=25)
+    """
+    helper = LayerHelper('sample_logits', **locals())
+    samples = helper.create_variable_for_type_inference(dtype='int64')
+    probabilities = helper.create_variable_for_type_inference(
+        dtype=logits.dtype)
+    sampled_logits = helper.create_variable_for_type_inference(
+        dtype=logits.dtype)
+    sampled_label = helper.create_variable_for_type_inference(dtype='int64')
+    sampled_softlabel = helper.create_variable_for_type_inference(
+        dtype=logits.dtype)
+
+    helper.append_op(
+        type='sample_logits',
+        inputs={
+            'Logits': logits,
+            'Labels': label,
+            'CustomizedSamples': customized_samples,
+            'CustomizedProbabilities': customized_probabilities
+        },
+        outputs={
+            'Samples': samples,
+            'Probabilities': probabilities,
+            'SampledLabels': sampled_label,
+            'SampledLogits': sampled_logits
+        },
+        attrs={
+            'use_customized_samples': use_customized_samples,
+            'uniq': True,
+            'remove_accidental_hits': remove_accidental_hits,
+            'num_samples': num_samples,
+            'seed': seed
+        })
+    loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
+    softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
+    helper.append_op(
+        type='one_hot',
+        inputs={'X': sampled_label},
+        attrs={'depth': num_samples + 1},
+        outputs={'Out': sampled_softlabel})
+
+    helper.append_op(
+        type='softmax_with_cross_entropy',
+        inputs={'Logits': sampled_logits,
+                'Label': sampled_softlabel},
+        outputs={'Softmax': softmax,
+                 'Loss': loss},
+        attrs={
+            'soft_label': True,
+            'ignore_index': False,
+            'numeric_stable_mode': False
+        })
+    return loss / num_true
+
+
 def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     """
     This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
...
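For context on the Q(y|x) correction described in the new docstring: log-uniform candidate samplers conventionally assign P(class = c) = log((c + 2) / (c + 1)) / log(range_max + 1), so frequent (low-index) classes are drawn more often. A plain-numpy sketch of that convention (an assumption about the sampler's exact form, not taken from this diff):

    import numpy as np

    def log_uniform_prob(class_idx, range_max):
        # Probability of drawing class_idx under a log-uniform prior.
        return (np.log(class_idx + 2.0) - np.log(class_idx + 1.0)) \
            / np.log(range_max + 1.0)

    print(log_uniform_prob(np.array([0, 10, 1000]), 50000))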
python/paddle/fluid/layers/ops.py    View file @ cf0511f2
...
@@ -60,7 +60,28 @@ __all__ += ["uniform_random"]
 _uniform_random_ = generate_layer_fn('uniform_random')


-def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
+def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
+    """
+    This operator initializes a variable with random values sampled from a
+    uniform distribution. The random result is in set [min, max].
+
+    Args:
+        shape (list): The shape of output variable.
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of data, such as
+            float32, float64 etc. Default: float32.
+        min (float): Minimum value of uniform random. Default -1.0.
+        max (float): Maximum value of uniform random. Default 1.0.
+        seed (int): Random seed used for generating samples. 0 means use a
+            seed generated by the system. Note that if seed is not 0, this
+            operator will always generate the same random numbers every time.
+            Default 0.
+
+    Examples:
+        .. code-block:: python
+
+            result = fluid.layers.uniform_random(shape=[32, 784])
+    """
     locals_var = locals().keys()
     if not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
...
@@ -72,12 +93,6 @@ def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
     return _uniform_random_(**kwargs)


-uniform_random.__doc__ = _uniform_random_.__doc__ + """
-Examples:
-
-    >>> result = fluid.layers.uniform_random(shape=[32, 784])
-"""

 __all__ += ['hard_shrink']
 _hard_shrink_ = generate_layer_fn('hard_shrink')
...
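A small sketch of the seed semantics documented above (whether two same-seed ops also match each other is a reading of the docstring, not verified here):

    import paddle.fluid as fluid

    # With seed != 0, each op should produce identical values run-to-run,
    # per the note in the new docstring.
    a = fluid.layers.uniform_random(shape=[2, 3], min=-1.0, max=1.0, seed=123)
    b = fluid.layers.uniform_random(shape=[2, 3], min=-1.0, max=1.0, seed=123)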
python/paddle/fluid/parallel_executor.py    View file @ cf0511f2
...
@@ -29,15 +29,6 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
 BuildStrategy = core.ParallelExecutor.BuildStrategy


-def _is_pserver_mode(main_program):
-    main = main_program if main_program \
-        else framework.default_main_program()
-    for op in main.global_block().ops:
-        if op.type in ["send", "recv"]:
-            return True
-    return False
-
-
 class ParallelExecutor(object):
     """
     ParallelExecutor is designed for data parallelism, which focuses on distributing
...
@@ -140,7 +131,7 @@ class ParallelExecutor(object):
         # FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
         # num_trainers is 1, so the current fields of build_strategy doesn't tell if
         # it's distributed model.
-        build_strategy.is_distribution = _is_pserver_mode(
+        build_strategy.is_distribution = framework.is_pserver_mode(
             main_program) or num_trainers > 1

         # step4: get main_program, scope, local_scopes
...
@@ -185,10 +176,13 @@ class ParallelExecutor(object):
         places = list(map(place_obj, self._places))

         # step7: init ParallelExecutor
+        # ParallelExecutor API will be deprecated, don't support parallel graph.
+        self._graph = core.Graph(main.desc)
+
         self.executor = core.ParallelExecutor(
-            places, persistable_vars, main.desc,
+            places, persistable_vars,
             cpt.to_text(loss_name) if loss_name else six.u(''), scope,
-            local_scopes, exec_strategy, build_strategy)
+            local_scopes, exec_strategy, build_strategy, self._graph)

         self.scope = scope
...
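The deleted helper isn't dropped outright; this hunk switches the call to the copy that now lives in fluid.framework. An equivalent check, as a reference sketch (function name here is hypothetical):

    import paddle.fluid.framework as framework

    def uses_pserver(prog=None):
        main = prog if prog else framework.default_main_program()
        # send/recv ops only appear in distributed (pserver) programs.
        return any(op.type in ("send", "recv") for op in main.global_block().ops)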
python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py    View file @ cf0511f2
...
@@ -18,8 +18,8 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
-from scipy.special import expit
 from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
+import paddle.fluid as fluid


 class TestMKLDNNReluDim2(TestRelu):
...
@@ -97,5 +97,64 @@ class TestMKLDNNAbsDim4(TestAbs):
         self.attrs = {"use_mkldnn": True}


+# Check if primitives already exist in backward
+class TestMKLDNNReluPrimitivesAlreadyExist(unittest.TestCase):
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+
+    def test_check_forward_backward(self):
+        place = core.CPUPlace()
+
+        np.random.seed(123)
+        x = np.random.uniform(-1, 1, [2, 2]).astype(np.float32)
+        out = np.abs(x)
+
+        out_grad = np.random.random_sample(x.shape).astype(np.float32)
+        x_grad = out_grad * np.sign(x)  # Abs grad calculation
+
+        var_dict = {'x': x, 'out': out, 'out@GRAD': out_grad, 'x@GRAD': x_grad}
+        var_names = list(var_dict.keys())
+        ground_truth = {name: var_dict[name] for name in var_names}
+
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            block = program.global_block()
+            for name in ground_truth:
+                block.create_var(
+                    name=name, dtype='float32', shape=ground_truth[name].shape)
+
+            relu_op = block.append_op(
+                type="abs",
+                inputs={"X": block.var('x'), },
+                outputs={"Out": block.var('out')},
+                attrs={"use_mkldnn": True})
+
+            # Generate backward op_desc
+            grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                relu_op.desc, set(), [])
+            grad_op_desc = grad_op_desc_list[0]
+            new_op_desc = block.desc.append_op()
+            new_op_desc.copy_from(grad_op_desc)
+            for var_name in grad_op_desc.output_arg_names():
+                block.desc.var(var_name.encode("ascii"))
+
+            grad_op_desc.infer_var_type(block.desc)
+            grad_op_desc.infer_shape(block.desc)
+            for arg in grad_op_desc.output_arg_names():
+                grad_var = block.desc.find_var(arg.encode("ascii"))
+                grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+            exe = fluid.Executor(place)
+
+            # Do at least 2 iterations
+            for i in range(2):
+                out = exe.run(
+                    program,
+                    feed={name: var_dict[name] for name in ['x', 'out@GRAD']},
+                    fetch_list=['x@GRAD'])
+                self.__assert_close(x_grad, out[0], "x@GRAD")
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_base_layer.py    View file @ cf0511f2
...
@@ -20,10 +20,10 @@ from paddle.fluid.layer_helper import LayerHelper

 class L1(fluid.imperative.Layer):
-    def __init__(self):
-        super(L1, self).__init__()
+    def __init__(self, prefix):
+        super(L1, self).__init__(prefix)
         self._helper = LayerHelper(
-            'MyLayer',
+            self.full_name(),
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.1)))
...
@@ -43,20 +43,20 @@ class L1(fluid.imperative.Layer):

 class L2(fluid.imperative.Layer):
-    def __init__(self):
-        super(L2, self).__init__()
-        self.layer1 = L1()
-        self.layer2 = L1()
+    def __init__(self, prefix):
+        super(L2, self).__init__(prefix)
+        self.layer1 = L1(self.full_name())
+        self.layer2 = L1(self.full_name())

     def forward(self):
         return self.layer1() + self.layer2()


 class L3(fluid.imperative.Layer):
-    def __init__(self):
-        super(L3, self).__init__()
-        self.layer1 = L2()
-        self.layer2 = L2()
+    def __init__(self, prefix):
+        super(L3, self).__init__(prefix)
+        self.layer1 = L2(self.full_name())
+        self.layer2 = L2(self.full_name())

     def forward(self):
         return self.layer1() + self.layer2()
...
@@ -65,16 +65,23 @@ class L3(fluid.imperative.Layer):

 class TestBaseLayer(unittest.TestCase):
     def test_one_level(self):
         with fluid.imperative.guard():
-            l = L1()
+            l = L1('test_one_level')
             ret = l()
-            self.assertEqual(l.w1.name, "MyLayer_0.w_0")
-            self.assertEqual(l.w2.name, "MyLayer_0.w_1")
+            self.assertEqual(l.w1.name, "test_one_level/L1_0_0.w_0")
+            self.assertEqual(l.w2.name, "test_one_level/L1_0_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))

     def test_three_level(self):
         with fluid.imperative.guard():
-            l = L3()
+            l = L3('test_three_level')
+            names = [p.name for p in l.parameters()]
             ret = l()
+            self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0_0.w_0")
+            self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0_0.w_1")
+            self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1_0.w_0")
+            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1_0.w_1")
+            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0_0.w_0")
+            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
...
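Reading the names asserted above (inferred from the test, not documented API): each nesting level contributes "<ClassName>_<instance index>" under its parent's prefix, and the trailing "_0.w_<n>" comes from the helper's parameter naming:

    name = "test_three_level/L3_0/L2_0/L1_1_0.w_1"
    scope_path, param = name.rsplit(".", 1)
    print(scope_path.split("/"))  # ['test_three_level', 'L3_0', 'L2_0', 'L1_1_0']
    print(param)                  # 'w_1'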
python/paddle/fluid/tests/unittests/test_imperative.py    View file @ cf0511f2
...
@@ -15,7 +15,6 @@
 import contextlib
 import unittest
 import numpy as np
-import sys
 import paddle.fluid as fluid
 from paddle.fluid import core
...
@@ -24,8 +23,8 @@ from test_imperative_base import new_program_scope

 class MyLayer(fluid.imperative.Layer):
-    def __init__(self):
-        super(MyLayer, self).__init__()
+    def __init__(self, name_scope):
+        super(MyLayer, self).__init__(name_scope)

     def forward(self, inputs):
         x = fluid.layers.relu(inputs)
...
@@ -50,12 +49,14 @@ class MyPyLayer(fluid.imperative.PyLayer):

 class MLP(fluid.imperative.Layer):
-    def __init__(self):
-        super(MLP, self).__init__()
-        self._fc1 = FC(3,
+    def __init__(self, name_scope):
+        super(MLP, self).__init__(name_scope)
+        self._fc1 = FC(self.full_name(),
+                       3,
                        fluid.ParamAttr(
                            initializer=fluid.initializer.Constant(value=0.1)))
-        self._fc2 = FC(4,
+        self._fc2 = FC(self.full_name(),
+                       4,
                        fluid.ParamAttr(
                            initializer=fluid.initializer.Constant(value=0.1)))
...
@@ -67,8 +68,9 @@ class MLP(fluid.imperative.Layer):

 class SimpleRNNCell(fluid.imperative.Layer):
-    def __init__(self, step_input_size, hidden_size, output_size, param_attr):
-        super(SimpleRNNCell, self).__init__()
+    def __init__(self, name_scope, step_input_size, hidden_size, output_size,
+                 param_attr):
+        super(SimpleRNNCell, self).__init__(name_scope)
         self.step_input_size = step_input_size
         self.hidden_size = hidden_size
         self.output_size = output_size
...
@@ -158,10 +160,11 @@ class SimpleRNNCell(fluid.imperative.Layer):

 class SimpleRNN(fluid.imperative.Layer):
-    def __init__(self):
-        super(SimpleRNN, self).__init__()
+    def __init__(self, name_scope):
+        super(SimpleRNN, self).__init__(name_scope)
         self.seq_len = 4
         self._cell = SimpleRNNCell(
+            self.full_name(),
             3,
             3,
             3,
...
@@ -205,7 +208,7 @@ class TestImperative(unittest.TestCase):
         with fluid.imperative.guard():
             cl = core.Layer()
             cl.forward([])
-            l = fluid.imperative.Layer()
+            l = fluid.imperative.Layer("l")
             self.assertRaises(NotImplementedError, l.forward, [])

     def test_pylayer_func_id(self):
...
@@ -281,7 +284,7 @@ class TestImperative(unittest.TestCase):
         np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
         with fluid.imperative.guard():
             var_inp = fluid.imperative.base.to_variable(np_inp)
-            l = MyLayer()
+            l = MyLayer("my_layer")
             x = l(var_inp)[0]
             self.assertIsNotNone(x)
             dy_out = x._numpy()
...
@@ -291,7 +294,7 @@ class TestImperative(unittest.TestCase):
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[3], append_batch_size=False)
-            l = MyLayer()
+            l = MyLayer("my_layer")
             x = l(inp)[0]
             param_grads = fluid.backward.append_backward(
                 x, parameter_list=[l._x_for_debug.name])[0]
...
@@ -309,7 +312,7 @@ class TestImperative(unittest.TestCase):
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
         with fluid.imperative.guard():
             var_inp = fluid.imperative.base.to_variable(np_inp)
-            mlp = MLP()
+            mlp = MLP("mlp")
             out = mlp(var_inp)
             dy_out = out._numpy()
             out._backward()
...
@@ -318,7 +321,7 @@ class TestImperative(unittest.TestCase):
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[2, 2], append_batch_size=False)
-            mlp = MLP()
+            mlp = MLP("mlp")
             out = mlp(inp)
             param_grads = fluid.backward.append_backward(
                 out, parameter_list=[mlp._fc1._w.name])[0]
...
@@ -334,10 +337,10 @@ class TestImperative(unittest.TestCase):
         self.assertTrue(np.allclose(dy_grad, static_grad))

         params = mlp.parameters(True)
-        self.assertEqual("FC_0.w_0", params[0].name)
-        self.assertEqual("FC_0.b_0", params[1].name)
-        self.assertEqual("FC_1.w_0", params[2].name)
-        self.assertEqual("FC_1.b_0", params[3].name)
+        self.assertEqual("mlp/MLP_0/FC_0_0.w_0", params[0].name)
+        self.assertEqual("mlp/MLP_0/FC_0_0.b_0", params[1].name)
+        self.assertEqual("mlp/MLP_0/FC_1_0.w_0", params[2].name)
+        self.assertEqual("mlp/MLP_0/FC_1_0.b_0", params[3].name)
         self.assertEqual(len(params), 4)

         sublayers = mlp.sublayers(True)
...
@@ -353,7 +356,7 @@ class TestImperative(unittest.TestCase):
         with fluid.imperative.guard():
             var_inp = fluid.imperative.base.to_variable(np_inp)
             var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
-            simple_rnn = SimpleRNN()
+            simple_rnn = SimpleRNN("simple_rnn")
             outs, pre_hiddens = simple_rnn.forward(var_inp)
             dy_out = outs[3]._numpy()
             outs[3]._backward()
...
@@ -364,7 +367,7 @@ class TestImperative(unittest.TestCase):
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[1, 4, 3], append_batch_size=False)
-            simple_rnn = SimpleRNN()
+            simple_rnn = SimpleRNN("simple_rnn")
             outs, pre_hiddens = simple_rnn(inp)
             param_grads = fluid.backward.append_backward(outs[3])
             exe = fluid.Executor(fluid.CPUPlace())
...
python/paddle/fluid/tests/unittests/test_imperative_gan.py    View file @ cf0511f2
...
@@ -28,10 +28,10 @@ from paddle.fluid.imperative.base import to_variable

 class Discriminator(fluid.imperative.Layer):
-    def __init__(self):
-        super(Discriminator, self).__init__()
-        self._fc1 = FC(size=32, act='elu', name="d_fc1")
-        self._fc2 = FC(size=1, name="d_fc2")
+    def __init__(self, name_scope):
+        super(Discriminator, self).__init__(name_scope)
+        self._fc1 = FC(self.full_name(), size=32, act='elu')
+        self._fc2 = FC(self.full_name(), size=1)

     def forward(self, inputs):
         x = self._fc1(inputs)
...
@@ -39,11 +39,11 @@ class Discriminator(fluid.imperative.Layer):

 class Generator(fluid.imperative.Layer):
-    def __init__(self):
-        super(Generator, self).__init__()
-        self._fc1 = FC(size=64, act='elu', name="g_fc1")
-        self._fc2 = FC(size=64, act='elu', name="g_fc2")
-        self._fc3 = FC(size=1, name="g_fc3")
+    def __init__(self, name_scope):
+        super(Generator, self).__init__(name_scope)
+        self._fc1 = FC(self.full_name(), size=64, act='elu')
+        self._fc2 = FC(self.full_name(), size=64, act='elu')
+        self._fc3 = FC(self.full_name(), size=1)

     def forward(self, inputs):
         x = self._fc1(inputs)
...
@@ -65,8 +65,8 @@ class TestImperativeMnist(unittest.TestCase):
         scope = fluid.core.Scope()
         with new_program_scope(
                 main=discriminate_p, startup=startup, scope=scope):
-            discriminator = Discriminator()
-            generator = Generator()
+            discriminator = Discriminator("d")
+            generator = Generator("g")

             img = fluid.layers.data(
                 name="img", shape=[2, 1], append_batch_size=False)
...
@@ -93,8 +93,8 @@ class TestImperativeMnist(unittest.TestCase):
             sgd.minimize(d_loss)

         with new_program_scope(main=generate_p, startup=startup, scope=scope):
-            discriminator = Discriminator()
-            generator = Generator()
+            discriminator = Discriminator("d")
+            generator = Generator("g")

             noise = fluid.layers.data(
                 name="noise", shape=[2, 2], append_batch_size=False)
...
@@ -134,8 +134,8 @@ class TestImperativeMnist(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed

-            discriminator = Discriminator()
-            generator = Generator()
+            discriminator = Discriminator("d")
+            generator = Generator("g")
             sgd = SGDOptimizer(learning_rate=1e-3)

             d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
...
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py    View file @ cf0511f2
...
@@ -28,6 +28,7 @@ from test_imperative_base import new_program_scope

 class SimpleImgConvPool(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  num_filters,
                  filter_size,
...
@@ -44,9 +45,10 @@ class SimpleImgConvPool(fluid.imperative.Layer):
                  use_cudnn=False,
                  param_attr=None,
                  bias_attr=None):
-        super(SimpleImgConvPool, self).__init__()
+        super(SimpleImgConvPool, self).__init__(name_scope)

         self._conv2d = Conv2D(
+            self.full_name(),
             num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
...
@@ -59,6 +61,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):
             use_cudnn=use_cudnn)

         self._pool2d = Pool2D(
+            self.full_name(),
             pool_size=pool_size,
             pool_type=pool_type,
             pool_stride=pool_stride,
...
@@ -73,19 +76,20 @@ class SimpleImgConvPool(fluid.imperative.Layer):

 class MNIST(fluid.imperative.Layer):
-    def __init__(self, param_attr=None, bias_attr=None):
-        super(MNIST, self).__init__()
+    def __init__(self, name_scope, param_attr=None, bias_attr=None):
+        super(MNIST, self).__init__(name_scope)

         self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            1, 20, 5, 2, 2, act="relu")
+            self.full_name(), 1, 20, 5, 2, 2, act="relu")

         self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            20, 50, 5, 2, 2, act="relu")
+            self.full_name(), 20, 50, 5, 2, 2, act="relu")

         pool_2_shape = 50 * 4 * 4
         SIZE = 10
         scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(10,
+        self._fc = FC(self.full_name(),
+                      10,
                       param_attr=fluid.param_attr.ParamAttr(
                           initializer=fluid.initializer.NormalInitializer(
                               loc=0.0, scale=scale)),
...
@@ -101,25 +105,24 @@ class MNIST(fluid.imperative.Layer):

 class TestImperativeMnist(unittest.TestCase):
     def test_mnist_float32(self):
         seed = 90
-        batch_num = 2
+        epoch_num = 1
         with fluid.imperative.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed

-            mnist = MNIST()
+            mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128)
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

             dy_param_init_value = {}
+            for epoch in range(epoch_num):
                 for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-                dy_x_data = np.array(
-                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    128, 1)
+                    dy_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(128, 1)

                     img = to_variable(dy_x_data)
                     label = to_variable(y_data)
...
@@ -128,19 +131,21 @@ class TestImperativeMnist(unittest.TestCase):
                     cost = mnist(img)
                     loss = fluid.layers.cross_entropy(cost, label)
                     avg_loss = fluid.layers.mean(loss)

                     dy_out = avg_loss._numpy()

-                if batch_id == 0:
-                    for param in fluid.default_main_program().global_block(
-                    ).all_parameters():
+                    if epoch == 0 and batch_id == 0:
+                        for param in mnist.parameters():
                             dy_param_init_value[param.name] = param._numpy()

                     avg_loss._backward()
                     sgd.minimize(avg_loss)
                     mnist.clear_gradients()
-                fluid.default_main_program().global_block()._clear_block()

                     dy_param_value = {}
-                for param in fluid.default_main_program().global_block(
-                ).all_parameters():
+                    for param in mnist.parameters():
                         dy_param_value[param.name] = param._numpy()

         with new_program_scope():
...
@@ -150,10 +155,10 @@ class TestImperativeMnist(unittest.TestCase):
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

-            mnist = MNIST()
+            mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128)
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

             img = fluid.layers.data(
                 name='pixel', shape=[1, 28, 28], dtype='float32')
...
@@ -166,8 +171,7 @@ class TestImperativeMnist(unittest.TestCase):
             # initialize params and fetch them
             static_param_init_value = {}
             static_param_name_list = []
-            for param in fluid.default_startup_program().global_block(
-            ).all_parameters():
+            for param in mnist.parameters():
                 static_param_name_list.append(param.name)

             out = exe.run(fluid.default_startup_program(),
...
@@ -176,18 +180,18 @@ class TestImperativeMnist(unittest.TestCase):
             for i in range(len(static_param_name_list)):
                 static_param_init_value[static_param_name_list[i]] = out[i]

+            for epoch in range(epoch_num):
                 for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-                static_x_data = np.array(
-                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [128, 1])
+                    static_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape([128, 1])

                     fetch_list = [avg_loss.name]
                     fetch_list.extend(static_param_name_list)
                     out = exe.run(fluid.default_main_program(),
                                   feed={"pixel": static_x_data,
                                         "label": y_data},
                                   fetch_list=fetch_list)
...
@@ -195,7 +199,10 @@ class TestImperativeMnist(unittest.TestCase):
                     static_param_value = {}
                     static_out = out[0]
                     for i in range(1, len(out)):
                         static_param_value[static_param_name_list[i - 1]] = out[i]

+        self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
+
         for key, value in six.iteritems(static_param_init_value):
             self.assertTrue(np.allclose(value, dy_param_init_value[key]))
...
@@ -203,7 +210,7 @@ class TestImperativeMnist(unittest.TestCase):
         self.assertTrue(np.allclose(static_out, dy_out))

         for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key]))
+            self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))

 if __name__ == '__main__':
...
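Why drop_last=True in the reader changes above (an inference from the reshape, not stated in the diff): y_data is reshaped to a fixed (128, 1), so a trailing partial batch would fail with a shape mismatch; dropping it keeps every batch exactly 128 samples:

    import paddle

    reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=128, drop_last=True)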
python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py    View file @ cf0511f2
...
@@ -28,12 +28,13 @@ from paddle.fluid.backward import append_backward

 class SimpleLSTMRNN(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  hidden_size,
                  num_steps,
                  num_layers=2,
                  init_scale=0.1,
                  dropout=None):
-        super(SimpleLSTMRNN, self).__init__()
+        super(SimpleLSTMRNN, self).__init__(name_scope)
         self._hidden_size = hidden_size
         self._num_layers = num_layers
         self._init_scale = init_scale
...
@@ -130,13 +131,14 @@ class SimpleLSTMRNN(fluid.imperative.Layer):

 class PtbModel(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  hidden_size,
                  vocab_size,
                  num_layers=2,
                  num_steps=20,
                  init_scale=0.1,
                  dropout=None):
-        super(PtbModel, self).__init__()
+        super(PtbModel, self).__init__(name_scope)
         self.hidden_size = hidden_size
         self.vocab_size = vocab_size
         self.init_scale = init_scale
...
@@ -146,12 +148,14 @@ class PtbModel(fluid.imperative.Layer):
         from paddle.fluid.layer_helper import LayerHelper
         self._helper = LayerHelper('PtbModel', act="tanh")
         self.simple_lstm_rnn = SimpleLSTMRNN(
+            self.full_name(),
             hidden_size,
             num_steps,
             num_layers=num_layers,
             init_scale=init_scale,
             dropout=dropout)
         self.embedding = Embedding(
+            self.full_name(),
             size=[vocab_size, hidden_size],
             dtype='float32',
             is_sparse=False,
...
@@ -226,6 +230,7 @@ class TestImperativePtbRnn(unittest.TestCase):
             fluid.default_main_program().random_seed = seed
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
+                "ptb_model",
                 hidden_size=hidden_size,
                 vocab_size=vocab_size,
                 num_layers=num_layers,
...
@@ -265,6 +270,7 @@ class TestImperativePtbRnn(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
             ptb_model = PtbModel(
+                "ptb_model",
                 hidden_size=hidden_size,
                 vocab_size=vocab_size,
                 num_layers=num_layers,
...
python/paddle/fluid/tests/unittests/test_imperative_resnet.py    View file @ cf0511f2
...
@@ -70,15 +70,17 @@ def optimizer_setting(params):
...
@@ -70,15 +70,17 @@ def optimizer_setting(params):
class
ConvBNLayer
(
fluid
.
imperative
.
Layer
):
class
ConvBNLayer
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
def
__init__
(
self
,
name_scope
,
num_channels
,
num_channels
,
num_filters
,
num_filters
,
filter_size
,
filter_size
,
stride
=
1
,
stride
=
1
,
groups
=
1
,
groups
=
1
,
act
=
None
):
act
=
None
):
super
(
ConvBNLayer
,
self
).
__init__
()
super
(
ConvBNLayer
,
self
).
__init__
(
name_scope
)
self
.
_conv
=
Conv2D
(
self
.
_conv
=
Conv2D
(
self
.
full_name
(),
num_channels
=
num_channels
,
num_channels
=
num_channels
,
num_filters
=
num_filters
,
num_filters
=
num_filters
,
filter_size
=
filter_size
,
filter_size
=
filter_size
,
...
@@ -88,7 +90,7 @@ class ConvBNLayer(fluid.imperative.Layer):
...
@@ -88,7 +90,7 @@ class ConvBNLayer(fluid.imperative.Layer):
act
=
None
,
act
=
None
,
bias_attr
=
None
)
bias_attr
=
None
)
self
.
_batch_norm
=
BatchNorm
(
num_filters
,
act
=
act
)
self
.
_batch_norm
=
BatchNorm
(
self
.
full_name
(),
num_filters
,
act
=
act
)
def
forward
(
self
,
inputs
):
def
forward
(
self
,
inputs
):
y
=
self
.
_conv
(
inputs
)
y
=
self
.
_conv
(
inputs
)
...
@@ -98,21 +100,29 @@ class ConvBNLayer(fluid.imperative.Layer):
...
@@ -98,21 +100,29 @@ class ConvBNLayer(fluid.imperative.Layer):
class
BottleneckBlock
(
fluid
.
imperative
.
Layer
):
class
BottleneckBlock
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
num_channels
,
num_filters
,
stride
,
shortcut
=
True
):
def
__init__
(
self
,
super
(
BottleneckBlock
,
self
).
__init__
()
name_scope
,
num_channels
,
num_filters
,
stride
,
shortcut
=
True
):
super
(
BottleneckBlock
,
self
).
__init__
(
name_scope
)
self
.
conv0
=
ConvBNLayer
(
self
.
conv0
=
ConvBNLayer
(
self
.
full_name
(),
num_channels
=
num_channels
,
num_channels
=
num_channels
,
num_filters
=
num_filters
,
num_filters
=
num_filters
,
filter_size
=
1
,
filter_size
=
1
,
act
=
'relu'
)
act
=
'relu'
)
self
.
conv1
=
ConvBNLayer
(
self
.
conv1
=
ConvBNLayer
(
self
.
full_name
(),
num_channels
=
num_filters
,
num_channels
=
num_filters
,
num_filters
=
num_filters
,
num_filters
=
num_filters
,
filter_size
=
3
,
filter_size
=
3
,
stride
=
stride
,
stride
=
stride
,
act
=
'relu'
)
act
=
'relu'
)
self
.
conv2
=
ConvBNLayer
(
self
.
conv2
=
ConvBNLayer
(
self
.
full_name
(),
num_channels
=
num_filters
,
num_channels
=
num_filters
,
num_filters
=
num_filters
*
4
,
num_filters
=
num_filters
*
4
,
filter_size
=
1
,
filter_size
=
1
,
...
@@ -120,6 +130,7 @@ class BottleneckBlock(fluid.imperative.Layer):
...
@@ -120,6 +130,7 @@ class BottleneckBlock(fluid.imperative.Layer):
if
not
shortcut
:
if
not
shortcut
:
self
.
short
=
ConvBNLayer
(
self
.
short
=
ConvBNLayer
(
self
.
full_name
(),
num_channels
=
num_channels
,
num_channels
=
num_channels
,
num_filters
=
num_filters
*
4
,
num_filters
=
num_filters
*
4
,
filter_size
=
1
,
filter_size
=
1
,
...
@@ -141,13 +152,13 @@ class BottleneckBlock(fluid.imperative.Layer):
         y = fluid.layers.elementwise_add(x=short, y=conv2)

-        layer_helper = LayerHelper('elementwise_add_activation', act='relu')
+        layer_helper = LayerHelper(self.full_name(), act='relu')
         return layer_helper.append_activation(y)


 class ResNet(fluid.imperative.Layer):
-    def __init__(self, layers=50, class_dim=102):
-        super(ResNet, self).__init__()
+    def __init__(self, name_scope, layers=50, class_dim=102):
+        super(ResNet, self).__init__(name_scope)

         self.layers = layers
         supported_layers = [50, 101, 152]
...
@@ -163,9 +174,18 @@ class ResNet(fluid.imperative.Layer):
         num_filters = [64, 128, 256, 512]

         self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
+            self.full_name(),
+            num_channels=3,
+            num_filters=64,
+            filter_size=7,
+            stride=2,
+            act='relu')
         self.pool2d_max = Pool2D(
-            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.full_name(),
+            pool_size=3,
+            pool_stride=2,
+            pool_padding=1,
+            pool_type='max')

         self.bottleneck_block_list = []
         num_channels = 64
...
@@ -175,6 +195,7 @@ class ResNet(fluid.imperative.Layer):
                 bottleneck_block = self.add_sublayer(
                     'bb_%d_%d' % (block, i),
                     BottleneckBlock(
+                        self.full_name(),
                         num_channels=num_channels,
                         num_filters=num_filters[block],
                         stride=2 if i == 0 and block != 0 else 1,
...
@@ -184,12 +205,13 @@ class ResNet(fluid.imperative.Layer):
                 shortcut = True

         self.pool2d_avg = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+            self.full_name(), pool_size=7, pool_type='avg', global_pooling=True)

         import math
         stdv = 1.0 / math.sqrt(2048 * 1.0)

-        self.out = FC(size=class_dim,
+        self.out = FC(self.full_name(),
+                      size=class_dim,
                       act='softmax',
                       param_attr=fluid.param_attr.ParamAttr(
                           initializer=fluid.initializer.Uniform(-stdv, stdv)))
...
@@ -209,12 +231,12 @@ class TestImperativeResnet(unittest.TestCase):
         seed = 90

         batch_size = train_parameters["batch_size"]
-        batch_num = 1
+        batch_num = 2
         with fluid.imperative.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed

-            resnet = ResNet()
+            resnet = ResNet("resnet")
             optimizer = optimizer_setting(train_parameters)
             np.random.seed(seed)
             import random
...
@@ -264,6 +286,8 @@ class TestImperativeResnet(unittest.TestCase):
                 optimizer.minimize(avg_loss)
                 resnet.clear_gradients()

+                fluid.default_main_program().global_block()._clear_block()
+
                 dy_param_value = {}
                 for param in resnet.parameters():
                     dy_param_value[param.name] = param._numpy()
...
@@ -275,7 +299,7 @@ class TestImperativeResnet(unittest.TestCase):
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

-            resnet = ResNet()
+            resnet = ResNet("resnet")
             optimizer = optimizer_setting(train_parameters)
             np.random.seed(seed)
...
@@ -297,11 +321,9 @@ class TestImperativeResnet(unittest.TestCase):
             static_param_init_value = {}
             static_param_name_list = []
             static_grad_name_list = []
-            for param in fluid.default_startup_program().global_block(
-            ).all_parameters():
+            for param in resnet.parameters():
                 static_param_name_list.append(param.name)
-            for param in fluid.default_main_program().global_block(
-            ).all_parameters():
+            for param in resnet.parameters():
                 if not param.stop_gradient:
                     static_grad_name_list.append(param.name +
                                                  core.grad_var_suffix())
...
...

python/paddle/fluid/tests/unittests/test_layers.py
View file @ cf0511f2
...
@@ -374,6 +374,17 @@ class TestBook(unittest.TestCase):
         self.assertIsNotNone(output)
         print(str(program))

+    def test_sampled_softmax_with_cross_entropy(self):
+        program = Program()
+        with program_guard(program):
+            logits = layers.data(name='Logits', shape=[256], dtype='float64')
+            label = layers.data(name='Label', shape=[1], dtype='int64')
+            num_samples = 25
+            output = layers.sampled_softmax_with_cross_entropy(logits, label,
+                                                               num_samples)
+            self.assertIsNotNone(output)
+            print(str(program))
+
     @decorators.prog_scope()
     def test_nce(self):
         window_size = 5
...
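The new test exercises sampled_softmax_with_cross_entropy, which approximates full softmax cross-entropy over 256 classes by scoring the true class against num_samples=25 sampled candidates instead of the whole output layer. A simplified NumPy illustration of the idea only; it omits the sampling-probability correction the real op applies, and the function name here is made up for the sketch:

import numpy as np

def sampled_softmax_ce(logits, label, num_samples, rng):
    # Draw negatives, then take softmax cross-entropy over the small
    # candidate set {label} + negatives instead of all classes.
    negatives = rng.choice(
        [c for c in range(len(logits)) if c != label],
        size=num_samples, replace=False)
    scores = logits[np.concatenate(([label], negatives))]
    probs = np.exp(scores - scores.max())
    probs /= probs.sum()
    return -np.log(probs[0])  # the true class sits at index 0

rng = np.random.RandomState(0)
print(sampled_softmax_ce(rng.normal(size=256), 3, 25, rng))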
python/paddle/fluid/tests/unittests/test_lstmp_op.py
View file @ cf0511f2
...
@@ -36,12 +36,14 @@ def lstmp(
          w_b=None,  # 1 x 4D
          w_c=None,  # 1 x 3D
          is_reverse=False,
+         proj_clip=0.0,
+         cell_clip=0.0,
          act_gate=None,
          act_cell=None,
          act_cand=None,
          act_proj=None):
-    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand,
-              act_proj):
+    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, proj_clip, cell_clip, act_gate,
+              act_cell, act_cand, act_proj):
         g = np.dot(r_pre, w_r)  # 1 x 4D
         g = g + x
         g = np.reshape(g, (1, g.size))
...
@@ -55,6 +57,17 @@ def lstmp(
         g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
         c = g_f * c_pre + g_i * act_cand(c)  # 1 x D
+
+        def array_clip(a, clip):
+            size = np.prod(a.shape)
+            new_a = np.reshape(a, (size))
+            for i in range(size):
+                new_a[i] = max(new_a[i], -1.0 * clip)
+                new_a[i] = min(new_a[i], clip)
+            new_a = np.reshape(new_a, a.shape)
+            return new_a
+
+        if cell_clip > 0.0:
+            c = array_clip(c, cell_clip)
         if w_c is None:
             g_o = act_gate(g_o)  # 1 x D
         else:
...
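The element-wise loop in array_clip simply bounds every entry to [-clip, clip]; it is written out long-hand to mirror the kernel, but numerically it matches np.clip. A quick equivalence check, assuming only NumPy:

import numpy as np

def array_clip(a, clip):
    # Long-hand clipping, as in the reference above.
    size = np.prod(a.shape)
    new_a = np.reshape(a, (size))
    for i in range(size):
        new_a[i] = max(new_a[i], -1.0 * clip)
        new_a[i] = min(new_a[i], clip)
    return np.reshape(new_a, a.shape)

a = np.random.normal(size=(3, 4))
assert np.allclose(array_clip(a.copy(), 0.1), np.clip(a, -0.1, 0.1))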
@@ -64,6 +77,8 @@ def lstmp(
         # projection
         r = np.dot(h, w_rh)
         r = act_proj(r)
+        if proj_clip > 0.0:
+            r = array_clip(r, proj_clip)
         return r, c

     def _reverse(x, offset):
...
@@ -87,13 +102,13 @@ def lstmp(
         # compute one sequence
         seq_len = lod[0][i]
         x = input[offset[i]:offset[i + 1], :]
-        r_pre = np.dot(h0[i], w_rh)  # 1 x P
-        r_pre = act_proj(r_pre)
+        r_pre = h0[i]
         c_pre = c0[i]  # 1 x D
         for j in range(seq_len):
             # compute one step
-            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
-                                 act_cell, act_cand, act_proj)
+            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, proj_clip,
+                                 cell_clip, act_gate, act_cell, act_cand,
+                                 act_proj)
             projection.append(r_pre.flatten())
             cell.append(c_pre.flatten())
...
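The sequence loop above relies on LoD bookkeeping: lod[0] stores per-sequence lengths and offset is their running sum, so input[offset[i]:offset[i + 1], :] slices out sequence i from the time-major batch. A small sketch of that indexing, assuming only NumPy:

import numpy as np

lod = [[3, 1, 2]]                     # three sequences, lengths 3, 1 and 2
offset = np.cumsum([0] + lod[0])      # [0, 3, 4, 6]
inp = np.arange(6 * 2).reshape(6, 2)  # T=6 timesteps stacked into one tensor

for i, seq_len in enumerate(lod[0]):
    x = inp[offset[i]:offset[i + 1], :]
    assert x.shape[0] == seq_len      # each slice is exactly one sequence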
@@ -123,13 +138,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
         T = sum(self.lod[0])
         N = len(self.lod[0])
         x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
         if self.has_initial_state:
-            h0 = np.random.normal(size=(N, self.D)).astype('float64')
+            h0 = np.random.normal(size=(N, self.P)).astype('float64')
             c0 = np.random.normal(size=(N, self.D)).astype('float64')
         else:
-            h0 = np.zeros((N, self.D)).astype('float64')
+            h0 = np.zeros((N, self.P)).astype('float64')
             c0 = np.zeros((N, self.D)).astype('float64')
         w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')
         if self.use_peepholes:
...
@@ -140,9 +154,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
         w_b = b[:, 0:4 * self.D]
         w_c = b[:, 4 * self.D:] if self.use_peepholes else None
         w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
+        proj_clip = 0.1
+        cell_clip = 0.1
-        r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
-                     ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
-                     ACTIVATION[self.act_cand], ACTIVATION[self.act_proj])
+        r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
+                     proj_clip, cell_clip, ACTIVATION[self.act_gate],
+                     ACTIVATION[self.act_cell], ACTIVATION[self.act_cand],
+                     ACTIVATION[self.act_proj])

         self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
...
@@ -159,6 +176,8 @@ class TestLstmpOp(LstmTest.TestLstmOp):
         self.attrs = {
             'use_peepholes': self.use_peepholes,
             'is_reverse': self.is_reverse,
+            'proj_clip': proj_clip,
+            'cell_clip': cell_clip,
             'gate_activation': self.act_gate,
             'cell_activation': self.act_cell,
             'candidate_activation': self.act_cand,
...
@@ -171,14 +190,14 @@ class TestLstmpOp(LstmTest.TestLstmOp):
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
-            max_relative_error=1e-2)
+            max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005)


 class TestLstmpOpHasInitial(TestLstmpOp):
...
@@ -188,7 +207,6 @@ class TestLstmpOpHasInitial(TestLstmpOp):
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
...
@@ -196,11 +214,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
             ['Projection'],
+            numeric_grad_delta=0.0000005,
             max_relative_error=1e-2)

     def test_check_grad_ingore_bias(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
...
@@ -208,11 +226,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'ProjWeight', 'Weight'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('Bias'))

     def test_check_grad_ingore_weight(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
...
@@ -220,11 +238,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'ProjWeight', 'Bias'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('Weight'))

     def test_check_grad_ingore_proj_weight(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
...
@@ -232,11 +250,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'Bias'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('ProjWeight'))

     def test_check_grad_ingore_input(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
...
@@ -244,11 +262,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Weight', 'ProjWeight', 'Bias'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('Input'))

     def test_check_grad_ingore_h0(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
...
@@ -256,11 +274,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('H0'))

     def test_check_grad_ingore_c0(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
...
@@ -268,6 +286,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('C0'))
...
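The numeric_grad_delta=0.0000005 added to these check_grad calls tightens the step the finite-difference checker uses when comparing the op's analytic gradient against a numeric one; with hard clipping in the op, a coarser step that straddles a clip threshold would distort the numeric estimate. A sketch of the central-difference scheme that such a delta controls, assuming only NumPy (numeric_grad here is illustrative, not Paddle's checker):

import numpy as np

def numeric_grad(f, x, delta):
    # Central difference: df/dx_i ~ (f(x + h*e_i) - f(x - h*e_i)) / (2h).
    grad = np.zeros_like(x)
    for i in range(x.size):
        orig = x.flat[i]
        x.flat[i] = orig + delta
        plus = f(x)
        x.flat[i] = orig - delta
        minus = f(x)
        x.flat[i] = orig
        grad.flat[i] = (plus - minus) / (2.0 * delta)
    return grad

f = lambda x: np.sum(np.clip(x, -0.1, 0.1) ** 2)
x = np.random.normal(size=4)
print(numeric_grad(f, x, delta=5e-7))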