Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
c4187dbd
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c4187dbd
编写于
2月 27, 2019
作者:
X
xiaoli.liu@intel.com
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into dequantize-reuse
上级
70759d18
6724be2b
变更
145
显示空白变更内容
内联
并排
Showing
145 changed file
with
5373 addition
and
1557 deletion
+5373
-1557
README.md
README.md
+11
-11
README_cn.md
README_cn.md
+11
-11
cmake/operators.cmake
cmake/operators.cmake
+3
-0
paddle/fluid/API.spec
paddle/fluid/API.spec
+5
-4
paddle/fluid/framework/block_desc.cc
paddle/fluid/framework/block_desc.cc
+14
-0
paddle/fluid/framework/block_desc.h
paddle/fluid/framework/block_desc.h
+2
-0
paddle/fluid/framework/data_layout_transform.cc
paddle/fluid/framework/data_layout_transform.cc
+7
-16
paddle/fluid/framework/data_transform.cc
paddle/fluid/framework/data_transform.cc
+24
-6
paddle/fluid/framework/details/all_reduce_deps_pass.cc
paddle/fluid/framework/details/all_reduce_deps_pass.cc
+2
-2
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+5
-31
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+1
-1
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
...uid/framework/details/fast_threaded_ssa_graph_executor.cc
+4
-5
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
...luid/framework/details/fast_threaded_ssa_graph_executor.h
+2
-2
paddle/fluid/framework/details/memory_optimize_helper.cc
paddle/fluid/framework/details/memory_optimize_helper.cc
+39
-8
paddle/fluid/framework/details/memory_optimize_helper.h
paddle/fluid/framework/details/memory_optimize_helper.h
+6
-4
paddle/fluid/framework/details/memory_optimize_helper_test.cc
...le/fluid/framework/details/memory_optimize_helper_test.cc
+4
-22
paddle/fluid/framework/details/memory_optimize_pass.cc
paddle/fluid/framework/details/memory_optimize_pass.cc
+6
-8
paddle/fluid/framework/details/multi_devices_graph_pass.cc
paddle/fluid/framework/details/multi_devices_graph_pass.cc
+15
-3
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
...le/fluid/framework/details/parallel_ssa_graph_executor.cc
+4
-11
paddle/fluid/framework/details/parallel_ssa_graph_executor.h
paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+2
-4
paddle/fluid/framework/details/sequential_execution_pass.cc
paddle/fluid/framework/details/sequential_execution_pass.cc
+2
-2
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+4
-5
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+2
-2
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+2
-0
paddle/fluid/framework/ir/graph.cc
paddle/fluid/framework/ir/graph.cc
+3
-0
paddle/fluid/framework/ir/graph.h
paddle/fluid/framework/ir/graph.h
+7
-1
paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
.../framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
+151
-0
paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
...ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+40
-10
paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
+1
-1
paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
...fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
+136
-0
paddle/fluid/framework/op_registry.h
paddle/fluid/framework/op_registry.h
+1
-1
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+16
-2
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+35
-2
paddle/fluid/framework/operator_kernel_configs.h
paddle/fluid/framework/operator_kernel_configs.h
+118
-0
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+50
-42
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+3
-3
paddle/fluid/framework/tensor.h
paddle/fluid/framework/tensor.h
+33
-8
paddle/fluid/framework/var_type_traits.h
paddle/fluid/framework/var_type_traits.h
+0
-5
paddle/fluid/imperative/layer.cc
paddle/fluid/imperative/layer.cc
+2
-1
paddle/fluid/imperative/layer.h
paddle/fluid/imperative/layer.h
+29
-18
paddle/fluid/imperative/tracer.cc
paddle/fluid/imperative/tracer.cc
+17
-8
paddle/fluid/imperative/tracer.h
paddle/fluid/imperative/tracer.h
+6
-4
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+2
-1
paddle/fluid/operators/alloc_continuous_space_op.cc
paddle/fluid/operators/alloc_continuous_space_op.cc
+211
-0
paddle/fluid/operators/beam_search_decode_op.cc
paddle/fluid/operators/beam_search_decode_op.cc
+1
-1
paddle/fluid/operators/beam_search_decode_op.h
paddle/fluid/operators/beam_search_decode_op.h
+1
-1
paddle/fluid/operators/benchmark/CMakeLists.txt
paddle/fluid/operators/benchmark/CMakeLists.txt
+3
-0
paddle/fluid/operators/benchmark/op_tester.cc
paddle/fluid/operators/benchmark/op_tester.cc
+336
-0
paddle/fluid/operators/benchmark/op_tester.h
paddle/fluid/operators/benchmark/op_tester.h
+68
-0
paddle/fluid/operators/benchmark/op_tester_config.cc
paddle/fluid/operators/benchmark/op_tester_config.cc
+188
-0
paddle/fluid/operators/benchmark/op_tester_config.h
paddle/fluid/operators/benchmark/op_tester_config.h
+60
-0
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+14
-45
paddle/fluid/operators/conv_cudnn_op_cache.h
paddle/fluid/operators/conv_cudnn_op_cache.h
+1
-95
paddle/fluid/operators/conv_fusion_op.cu.cc
paddle/fluid/operators/conv_fusion_op.cu.cc
+9
-24
paddle/fluid/operators/conv_op.cc
paddle/fluid/operators/conv_op.cc
+40
-7
paddle/fluid/operators/cross_entropy_op.cc
paddle/fluid/operators/cross_entropy_op.cc
+39
-18
paddle/fluid/operators/data_norm_op.cc
paddle/fluid/operators/data_norm_op.cc
+0
-3
paddle/fluid/operators/detection/prior_box_op.h
paddle/fluid/operators/detection/prior_box_op.h
+12
-1
paddle/fluid/operators/detection/yolov3_loss_op.cc
paddle/fluid/operators/detection/yolov3_loss_op.cc
+23
-17
paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
...operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+6
-13
paddle/fluid/operators/fake_quantize_op.cc
paddle/fluid/operators/fake_quantize_op.cc
+2
-4
paddle/fluid/operators/jit/test.cc
paddle/fluid/operators/jit/test.cc
+13
-13
paddle/fluid/operators/lstm_op.h
paddle/fluid/operators/lstm_op.h
+5
-3
paddle/fluid/operators/lstmp_op.cc
paddle/fluid/operators/lstmp_op.cc
+10
-11
paddle/fluid/operators/lstmp_op.h
paddle/fluid/operators/lstmp_op.h
+67
-35
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+1
-0
paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+20
-18
paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
+16
-14
paddle/fluid/operators/math/detail/lstm_kernel.h
paddle/fluid/operators/math/detail/lstm_kernel.h
+49
-13
paddle/fluid/operators/math/lstm_compute.cc
paddle/fluid/operators/math/lstm_compute.cc
+5
-4
paddle/fluid/operators/math/lstm_compute.cu
paddle/fluid/operators/math/lstm_compute.cu
+6
-6
paddle/fluid/operators/math/lstm_compute.h
paddle/fluid/operators/math/lstm_compute.h
+2
-2
paddle/fluid/operators/math/sample_prob.cc
paddle/fluid/operators/math/sample_prob.cc
+26
-0
paddle/fluid/operators/math/sample_prob.cu
paddle/fluid/operators/math/sample_prob.cu
+161
-0
paddle/fluid/operators/math/sample_prob.h
paddle/fluid/operators/math/sample_prob.h
+118
-0
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+8
-18
paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+11
-25
paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+1
-7
paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+25
-26
paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+1
-2
paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+6
-2
paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
+8
-13
paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+25
-25
paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+11
-4
paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+8
-113
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+22
-3
paddle/fluid/operators/pool_op.cc
paddle/fluid/operators/pool_op.cc
+84
-54
paddle/fluid/operators/sample_logits_op.cc
paddle/fluid/operators/sample_logits_op.cc
+225
-0
paddle/fluid/operators/sample_logits_op.cu
paddle/fluid/operators/sample_logits_op.cu
+257
-0
paddle/fluid/operators/sample_logits_op.h
paddle/fluid/operators/sample_logits_op.h
+245
-0
paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
+77
-15
paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
+5
-5
paddle/fluid/platform/CMakeLists.txt
paddle/fluid/platform/CMakeLists.txt
+3
-3
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+2
-2
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+14
-2
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+1
-2
paddle/fluid/platform/enforce.h
paddle/fluid/platform/enforce.h
+1
-0
paddle/fluid/platform/event.h
paddle/fluid/platform/event.h
+68
-0
paddle/fluid/platform/mkldnn_reuse.h
paddle/fluid/platform/mkldnn_reuse.h
+49
-39
paddle/fluid/platform/mkldnn_utils.h
paddle/fluid/platform/mkldnn_utils.h
+69
-0
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+47
-10
paddle/fluid/platform/profiler.cu
paddle/fluid/platform/profiler.cu
+10
-10
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+15
-47
paddle/fluid/platform/profiler_test.cc
paddle/fluid/platform/profiler_test.cc
+0
-1
paddle/fluid/platform/temporary_allocator_test.cc
paddle/fluid/platform/temporary_allocator_test.cc
+4
-4
paddle/fluid/pybind/imperative.cc
paddle/fluid/pybind/imperative.cc
+4
-4
paddle/fluid/pybind/ir.cc
paddle/fluid/pybind/ir.cc
+9
-7
paddle/fluid/pybind/protobuf.cc
paddle/fluid/pybind/protobuf.cc
+2
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+4
-3
paddle/fluid/train/demo/demo_trainer.cc
paddle/fluid/train/demo/demo_trainer.cc
+2
-2
paddle/fluid/train/test_train_recognize_digits.cc
paddle/fluid/train/test_train_recognize_digits.cc
+1
-1
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+1
-0
python/paddle/fluid/compiler.py
python/paddle/fluid/compiler.py
+49
-24
python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
...ddle/fluid/contrib/slim/quantization/quantization_pass.py
+68
-55
python/paddle/fluid/contrib/slim/tests/test_graph.py
python/paddle/fluid/contrib/slim/tests/test_graph.py
+3
-3
python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
...paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+26
-31
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+2
-0
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+496
-65
python/paddle/fluid/imperative/layers.py
python/paddle/fluid/imperative/layers.py
+23
-4
python/paddle/fluid/imperative/nn.py
python/paddle/fluid/imperative/nn.py
+18
-19
python/paddle/fluid/layer_helper.py
python/paddle/fluid/layer_helper.py
+3
-0
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+10
-9
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+249
-29
python/paddle/fluid/layers/ops.py
python/paddle/fluid/layers/ops.py
+22
-7
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+6
-12
python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
...ddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
+150
-0
python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
+72
-0
python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
...fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
+17
-55
python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py
...le/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py
+57
-0
python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py
...e/fluid/tests/unittests/test_alloc_continuous_space_op.py
+74
-0
python/paddle/fluid/tests/unittests/test_base_layer.py
python/paddle/fluid/tests/unittests/test_base_layer.py
+22
-15
python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
...d/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+2
-0
python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+35
-2
python/paddle/fluid/tests/unittests/test_imperative.py
python/paddle/fluid/tests/unittests/test_imperative.py
+25
-22
python/paddle/fluid/tests/unittests/test_imperative_gan.py
python/paddle/fluid/tests/unittests/test_imperative_gan.py
+15
-15
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
...paddle/fluid/tests/unittests/test_imperative_optimizer.py
+71
-64
python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
...n/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+8
-2
python/paddle/fluid/tests/unittests/test_imperative_resnet.py
...on/paddle/fluid/tests/unittests/test_imperative_resnet.py
+40
-18
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
...ddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
+55
-0
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
...id/tests/unittests/test_ir_memory_optimize_transformer.py
+30
-5
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+11
-0
python/paddle/fluid/tests/unittests/test_lstmp_op.py
python/paddle/fluid/tests/unittests/test_lstmp_op.py
+39
-20
python/paddle/fluid/tests/unittests/test_softmax_op.py
python/paddle/fluid/tests/unittests/test_softmax_op.py
+0
-10
python/paddle/utils/plot.py
python/paddle/utils/plot.py
+8
-7
python/paddle/utils/preprocess_img.py
python/paddle/utils/preprocess_img.py
+2
-2
未找到文件。
README.md
浏览文件 @
c4187dbd
...
...
@@ -3,8 +3,8 @@
English |
[
简体中文
](
./README_cn.md
)
[

](https://travis-ci.org/PaddlePaddle/Paddle)
[

](http://paddlepaddle.org/documentation/docs/en/1.
2/getstarted
/index_en.html)
[

](http://paddlepaddle.org/documentation/docs/zh/1.
2
/beginners_guide/index.html)
[

](http://paddlepaddle.org/documentation/docs/en/1.
3/beginners_guide
/index_en.html)
[

](http://paddlepaddle.org/documentation/docs/zh/1.
3
/beginners_guide/index.html)
[

](https://github.com/PaddlePaddle/Paddle/releases)
[

](LICENSE)
...
...
@@ -18,7 +18,7 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our
[
release announcement
](
https://github.com/PaddlePaddle/Paddle/releases
)
to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.
2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2
)
### Latest PaddlePaddle Release: [Fluid 1.
3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3
)
### Install Latest Stable Release:
```
# Linux CPU
...
...
@@ -26,9 +26,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.
2
.0.post87
pip install paddlepaddle-gpu==1.
3
.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.
2
.0.post85
pip install paddlepaddle-gpu==1.
3
.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
...
...
@@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
## Installation
It is recommended to read
[
this doc
](
http://paddlepaddle.org/documentation/docs/
zh/1.2/beginners_guide/install/index_c
n.html
)
on our website.
It is recommended to read
[
this doc
](
http://paddlepaddle.org/documentation/docs/
en/1.3/beginners_guide/index_e
n.html
)
on our website.
## Documentation
We provide
[
English
](
http://paddlepaddle.org/documentation/docs/en/1.
2/getstarted
/index_en.html
)
and
[
Chinese
](
http://paddlepaddle.org/documentation/docs/zh/1.
2
/beginners_guide/index.html
)
documentation.
We provide
[
English
](
http://paddlepaddle.org/documentation/docs/en/1.
3/beginners_guide
/index_en.html
)
and
[
Chinese
](
http://paddlepaddle.org/documentation/docs/zh/1.
3
/beginners_guide/index.html
)
documentation.
-
[
Deep Learning 101
](
https://github.com/PaddlePaddle/book
)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
-
[
Distributed Training
](
http://paddlepaddle.org/documentation/docs/
zh/1.2/user_guides/howto/training/cluster_howto
.html
)
-
[
Distributed Training
](
http://paddlepaddle.org/documentation/docs/
en/1.3/user_guides/howto/training/multi_node_en
.html
)
You can run distributed training jobs on MPI clusters.
-
[
Python API
](
http://paddlepaddle.org/documentation/docs/
zh/1.2/api_cn/index_c
n.html
)
-
[
Python API
](
http://paddlepaddle.org/documentation/docs/
en/1.3/api/index_e
n.html
)
Our new API enables much shorter programs.
-
[
How to Contribute
](
http://paddlepaddle.org/documentation/docs/
zh/1.2/advanced_usage/development/contribute_to_paddle/index_c
n.html
)
-
[
How to Contribute
](
http://paddlepaddle.org/documentation/docs/
en/1.3/advanced_usage/development/contribute_to_paddle/index_e
n.html
)
We appreciate your contributions!
...
...
README_cn.md
浏览文件 @
c4187dbd
...
...
@@ -3,8 +3,8 @@
[
English
](
./README.md
)
| 简体中文
[

](https://travis-ci.org/PaddlePaddle/Paddle)
[

](http://paddlepaddle.org/documentation/docs/en/1.
2/getstarted
/index_en.html)
[

](http://paddlepaddle.org/documentation/docs/zh/1.
2
/beginners_guide/index.html)
[

](http://paddlepaddle.org/documentation/docs/en/1.
3/beginners_guide
/index_en.html)
[

](http://paddlepaddle.org/documentation/docs/zh/1.
3
/beginners_guide/index.html)
[

](https://github.com/PaddlePaddle/Paddle/releases)
[

](LICENSE)
...
...
@@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
跟进PaddlePaddle最新特性请参考我们的
[
版本说明
](
https://github.com/PaddlePaddle/Paddle/releases
)
### PaddlePaddle最新版本: [Fluid 1.
2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2
)
### PaddlePaddle最新版本: [Fluid 1.
3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3
)
### 安装最新稳定版本:
```
# Linux CPU
...
...
@@ -24,9 +24,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.
2
.0.post87
pip install paddlepaddle-gpu==1.
3
.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.
2
.0.post85
pip install paddlepaddle-gpu==1.
3
.0.post85
# 其他平台上的安装指引请参考 http://paddlepaddle.org/
```
...
...
@@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
## 安装
推荐阅读官网上的
[
安装说明
](
http://paddlepaddle.org/documentation/docs/zh/1.
2
/beginners_guide/install/index_cn.html
)
推荐阅读官网上的
[
安装说明
](
http://paddlepaddle.org/documentation/docs/zh/1.
3
/beginners_guide/install/index_cn.html
)
## 文档
我们提供
[
英文
](
http://paddlepaddle.org/documentation/docs/en/1.
2/getstarted
/index_en.html
)
和
[
中文
](
http://paddlepaddle.org/documentation/docs/zh/1.
2
/beginners_guide/index.html
)
文档
我们提供
[
英文
](
http://paddlepaddle.org/documentation/docs/en/1.
3/beginners_guide
/index_en.html
)
和
[
中文
](
http://paddlepaddle.org/documentation/docs/zh/1.
3
/beginners_guide/index.html
)
文档
-
[
深度学习101
](
https://github.com/PaddlePaddle/book
)
或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
-
[
分布式训练
](
http://paddlepaddle.org/documentation/docs/zh/1.
2/user_guides/howto/training/cluster_howto
.html
)
-
[
分布式训练
](
http://paddlepaddle.org/documentation/docs/zh/1.
3/user_guides/howto/training/multi_node
.html
)
可以在MPI集群上运行分布式训练任务
-
[
Python API
](
http://paddlepaddle.org/documentation/docs/zh/1.
2
/api_cn/index_cn.html
)
-
[
Python API
](
http://paddlepaddle.org/documentation/docs/zh/1.
3
/api_cn/index_cn.html
)
新的API支持代码更少更简洁的程序
-
[
贡献方式
](
http://paddlepaddle.org/documentation/docs/zh/1.
2
/advanced_usage/development/contribute_to_paddle/index_cn.html
)
-
[
贡献方式
](
http://paddlepaddle.org/documentation/docs/zh/1.
3
/advanced_usage/development/contribute_to_paddle/index_cn.html
)
欢迎您的贡献!
...
...
cmake/operators.cmake
浏览文件 @
c4187dbd
...
...
@@ -168,6 +168,9 @@ function(op_library TARGET)
file
(
APPEND
${
pybind_file
}
"USE_OP_DEVICE_KERNEL(relu, MKLDNN);
\n
"
)
elseif
(
${
MKLDNN_FILE
}
STREQUAL
"conv_mkldnn_op"
)
file
(
APPEND
${
pybind_file
}
"USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);
\n
"
)
file
(
APPEND
${
pybind_file
}
"USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);
\n
"
)
file
(
APPEND
${
pybind_file
}
"USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);
\n
"
)
else
()
file
(
APPEND
${
pybind_file
}
"USE_OP_DEVICE_KERNEL(
${
TARGET
}
, MKLDNN);
\n
"
)
endif
()
...
...
paddle/fluid/API.spec
浏览文件 @
c4187dbd
...
...
@@ -43,7 +43,7 @@ paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'start
paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False))
paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program
_or_graph
'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
...
...
@@ -71,7 +71,7 @@ paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'v
paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'
], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32'
, None))
paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'
, 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None
, None))
paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
...
...
@@ -92,7 +92,7 @@ paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'poo
paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', '
use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False
, False, None, None, None, False))
paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', '
name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW'
, False, None, None, None, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
...
...
@@ -121,6 +121,7 @@ paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.sampled_softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
...
...
@@ -303,7 +304,7 @@ paddle.fluid.layers.reciprocal ArgSpec(args=['x', 'name'], varargs=None, keyword
paddle.fluid.layers.square ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.softplus ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.softsign ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(
None, None, None, None
))
paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(
'float32', -1.0, 1.0, 0
))
paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
...
...
paddle/fluid/framework/block_desc.cc
浏览文件 @
c4187dbd
...
...
@@ -163,6 +163,20 @@ std::vector<OpDesc *> BlockDesc::AllOps() const {
return
res
;
}
void
BlockDesc
::
Clear
()
{
// clear all ops
ops_
.
clear
();
// clear all vars which are not persistable
for
(
auto
it
=
vars_
.
begin
();
it
!=
vars_
.
end
();)
{
if
(
it
->
second
->
Persistable
())
{
++
it
;
}
else
{
vars_
.
erase
(
it
++
);
}
}
}
void
BlockDesc
::
Flush
()
{
for
(
auto
&
op_desc
:
ops_
)
{
op_desc
->
Flush
();
...
...
paddle/fluid/framework/block_desc.h
浏览文件 @
c4187dbd
...
...
@@ -97,6 +97,8 @@ class BlockDesc {
std
::
vector
<
OpDesc
*>
AllOps
()
const
;
void
Clear
();
size_t
OpSize
()
const
{
return
ops_
.
size
();
}
OpDesc
*
Op
(
int
idx
)
const
{
return
ops_
.
at
(
idx
).
get
();
}
...
...
paddle/fluid/framework/data_layout_transform.cc
浏览文件 @
c4187dbd
...
...
@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout
=
out_layout
==
DataLayout
::
kAnyLayout
?
DataLayout
::
kNCHW
:
out_layout
;
auto
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
dynamic_cast
<
platform
::
MKLDNNDeviceContext
*>
(
pool
.
Get
(
expected_kernel_type
.
place_
));
auto
&
cpu_engine
=
dev_ctx
->
GetEngine
();
std
::
vector
<
int
>
in_tz
=
paddle
::
framework
::
vectorize2int
(
in
.
dims
());
std
::
vector
<
int
>
out_tz
=
in_tz
;
...
...
@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s"
,
in
.
type
());
memory
::
data_type
out_type
=
in_type
;
auto
in_format
=
platform
::
MKLDNNFormatForSize
(
in_tz
.
size
(),
in
.
format
());
auto
out_format
=
platform
::
MKLDNNFormatForSize
(
in_tz
.
size
(),
ToMKLDNNFormat
(
out_layout
));
// output tensor has the same dims as input. Reorder don't change dims
out
->
Resize
(
in
.
dims
());
if
(
in_format
!=
out_format
)
{
// tempory mem pd fr out , to make reorder
auto
out_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
out
->
dims
()),
mkldnn
::
memory
::
format
::
blocked
,
out_type
);
if
(
in
.
get_mkldnn_prim_desc
()
!=
out_mem_pd
)
{
void
*
in_data
=
GetDataFromTensor
(
in
,
in_type
);
auto
out_data
=
out
->
mutable_data
(
expected_kernel_type
.
place_
,
in
.
type
());
auto
in_memory
=
memory
({{{
in_tz
},
in_type
,
in_format
},
cpu_engine
},
in_data
);
auto
out_memory
=
memory
({{{
out_tz
},
out_type
,
out_format
},
cpu_engine
},
out_data
);
auto
in_memory
=
memory
(
in
.
get_mkldnn_prim_desc
(),
in_data
);
auto
out_memory
=
memory
(
out_mem_pd
,
out_data
);
platform
::
Reorder
(
in_memory
,
out_memory
);
}
else
{
out
->
ShareDataWith
(
in
);
}
out
->
set_layout
(
out_layout
);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out
->
set_format
(
memory
::
format
::
format_undef
);
#endif
}
...
...
paddle/fluid/framework/data_transform.cc
浏览文件 @
c4187dbd
...
...
@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
out
.
ShareDataWith
(
input_tensor
);
// TODO(jczaja): Remove that once all mkldnn ops
// are modified to work with mkldnn_blocked
auto
mkldnn_fmt
=
[
&
](
int
rank
)
{
switch
(
rank
)
{
case
5
:
return
mkldnn
::
memory
::
format
::
ncdhw
;
case
4
:
return
mkldnn
::
memory
::
format
::
nchw
;
case
3
:
return
mkldnn
::
memory
::
format
::
ncw
;
case
2
:
return
mkldnn
::
memory
::
format
::
nc
;
case
1
:
return
mkldnn
::
memory
::
format
::
x
;
default:
return
mkldnn
::
memory
::
format
::
blocked
;
}
};
auto
out_format
=
platform
::
MKLDNNFormatForSize
(
in
.
dims
().
size
(),
ToMKLDNNFormat
(
lin
));
auto
out_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
out
.
dims
()),
mkldnn_fmt
(
out
.
dims
().
size
()));
out
.
ShareDataWith
(
input_tensor
);
out
.
set_layout
(
DataLayout
::
kMKLDNN
);
out
.
set_format
(
out_format
);
out
.
set_mkldnn_prim_desc
(
out_mem_pd
);
#endif
}
else
{
// Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
...
...
paddle/fluid/framework/details/all_reduce_deps_pass.cc
浏览文件 @
c4187dbd
...
...
@@ -50,7 +50,7 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
std
::
unordered_map
<
std
::
string
,
int
>
vars
;
// TODO(gongwb): use graph topology sort to find the order of operators.
// Note that must assert topology sort is stable
auto
&
ops
=
Get
<
const
std
::
vector
<
OpDesc
*>>
(
kAll
OpDescs
);
auto
&
ops
=
graph
->
Get
<
const
std
::
vector
<
OpDesc
*>>
(
kStaleProgram
OpDescs
);
for
(
auto
*
op_desc
:
ops
)
{
auto
outputs
=
op_desc
->
Outputs
();
for
(
auto
&
o_it
:
outputs
)
{
...
...
@@ -120,4 +120,4 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
REGISTER_PASS
(
all_reduce_deps_pass
,
paddle
::
framework
::
details
::
AllReduceDepsPass
)
.
Require
PassAttr
(
paddle
::
framework
::
details
::
kAll
OpDescs
);
.
Require
GraphAttr
(
paddle
::
framework
::
details
::
kStaleProgram
OpDescs
);
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
c4187dbd
...
...
@@ -135,12 +135,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
void
AppendMultiDevPass
(
const
BuildStrategy
&
strategy
)
{
ir
::
Pass
*
multi_devices_pass
;
if
(
strategy_
.
is_distribution_
)
{
VLOG
(
3
)
<<
"multi device parameter server mode"
;
multi_devices_pass
=
AppendPass
(
"dist_multi_devices_pass"
).
get
();
}
else
{
if
(
strategy
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
)
{
VLOG
(
3
)
<<
"multi devices collective mode with allreduce"
;
multi_devices_pass
=
AppendPass
(
"allreduce_mode_multi_devices_pass"
).
get
();
}
else
if
(
strategy
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kReduce
)
{
VLOG
(
3
)
<<
"multi deivces collective mode with reduce"
;
multi_devices_pass
=
AppendPass
(
"reduce_mode_multi_devices_pass"
).
get
();
}
else
{
PADDLE_THROW
(
"Unknown reduce strategy."
);
...
...
@@ -171,7 +174,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
}
std
::
unique_ptr
<
ir
::
Graph
>
BuildStrategy
::
Apply
(
const
ProgramDesc
&
main_program
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
graph
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
string
&
loss_var_name
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
size_t
&
nranks
,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...
...
@@ -182,7 +186,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
// Create a default one if not finalized by user.
CreatePassesFromStrategy
(
false
);
std
::
unique_ptr
<
ir
::
Graph
>
graph
(
new
ir
::
Graph
(
main_program
));
for
(
std
::
shared_ptr
<
ir
::
Pass
>
&
pass
:
pass_builder_
->
AllPasses
())
{
if
(
IsMultiDevPass
(
pass
->
Type
()))
{
pass
->
Erase
(
kPlaces
);
...
...
@@ -200,41 +203,12 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass
->
Erase
(
"nccl_ctxs"
);
pass
->
SetNotOwned
<
platform
::
NCCLContextMap
>
(
"nccl_ctxs"
,
nctx
);
#endif
}
else
if
(
pass
->
Type
()
==
"memory_optimize_pass"
)
{
if
(
graph
->
Has
(
kAllOpDescs
))
{
graph
->
Erase
(
kAllOpDescs
);
}
const
std
::
vector
<
OpDesc
*>
*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
());
graph
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
all_op_descs
);
// take ownership
pass
->
Erase
(
kAllOpDescs
);
pass
->
SetNotOwned
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
all_op_descs
);
}
else
if
(
pass
->
Type
()
==
"sequential_execution_pass"
)
{
LOG
(
INFO
)
<<
"set enable_sequential_execution:"
<<
enable_sequential_execution_
;
pass
->
Erase
(
kAllOpDescs
);
pass
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
()));
}
else
if
(
pass
->
Type
()
==
"all_reduce_deps_pass"
)
{
LOG
(
INFO
)
<<
"SeqOnlyAllReduceOps:"
<<
SeqOnlyAllReduceOps
(
*
this
)
<<
", num_trainers:"
<<
num_trainers_
;
pass
->
Erase
(
kAllOpDescs
);
pass
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
()));
}
else
if
(
pass
->
Type
()
==
"inplace_pass"
)
{
if
(
graph
->
Has
(
kAllOpDescs
))
{
graph
->
Erase
(
kAllOpDescs
);
}
graph
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
()));
}
else
if
(
pass
->
Type
()
==
"fuse_relu_depthwise_conv_pass"
)
{
if
(
!
use_cuda
)
{
LOG
(
WARNING
)
<<
"fuse_relu_depthwise_conv_pass is only supported on "
...
...
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
c4187dbd
...
...
@@ -114,7 +114,7 @@ struct BuildStrategy {
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std
::
unique_ptr
<
ir
::
Graph
>
Apply
(
const
ProgramDesc
&
main_program
,
std
::
unique_ptr
<
ir
::
Graph
>
Apply
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
string
&
loss_var_name
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
浏览文件 @
c4187dbd
...
...
@@ -24,12 +24,11 @@ namespace details {
FastThreadedSSAGraphExecutor
::
FastThreadedSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
)
const
std
::
vector
<
platform
::
Place
>
&
places
,
ir
::
Graph
*
graph
)
:
strategy_
(
strategy
),
local_scopes_
(
local_scopes
),
places_
(
places
),
graph_
(
std
::
move
(
graph
)
),
graph_
(
graph
),
pool_
(
strategy
.
num_threads_
),
prepare_pool_
(
1
),
// add one more thread for generate op_deps
fetch_ctxs_
(
places
)
{
...
...
@@ -110,14 +109,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
}
}
if
(
exception_
.
IsCaught
())
{
ClearFetchOp
(
graph_
.
get
()
,
&
fetch_ops
);
ClearFetchOp
(
graph_
,
&
fetch_ops
);
exception_
.
ReThrow
();
}
}
num_complete
+=
num_comp
;
}
// Wait FetchOps.
ClearFetchOp
(
graph_
.
get
()
,
&
fetch_ops
);
ClearFetchOp
(
graph_
,
&
fetch_ops
);
return
fetches
;
}
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
浏览文件 @
c4187dbd
...
...
@@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
FastThreadedSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
ir
::
Graph
*
graph
);
FeedFetchList
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
)
override
;
const
ir
::
Graph
&
Graph
()
const
override
;
...
...
@@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
ExecutionStrategy
strategy_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
unique_ptr
<
ir
::
Graph
>
graph_
;
ir
::
Graph
*
graph_
;
std
::
unordered_map
<
OpHandleBase
*
,
int
>
op_deps_
;
std
::
vector
<
OpHandleBase
*>
bootstrap_ops_
;
...
...
paddle/fluid/framework/details/memory_optimize_helper.cc
浏览文件 @
c4187dbd
...
...
@@ -33,10 +33,10 @@ namespace details {
using
paddle
::
framework
::
VarDesc
;
std
::
vector
<
ir
::
Node
*>
SortOpLikeDescOrder
(
const
ir
::
Graph
&
graph
)
{
PADDLE_ENFORCE
(
graph
.
Has
(
k
All
OpDescs
),
"Graph has no attribute of k
All
OpDescs."
);
PADDLE_ENFORCE
(
graph
.
Has
(
k
StaleProgram
OpDescs
),
"Graph has no attribute of k
StaleProgram
OpDescs."
);
// 1. get op desc order
auto
&
op_descs
=
graph
.
Get
<
const
std
::
vector
<
OpDesc
*>>
(
k
All
OpDescs
);
auto
&
op_descs
=
graph
.
Get
<
const
std
::
vector
<
OpDesc
*>>
(
k
StaleProgram
OpDescs
);
// 2. topology sort order
auto
nodes
=
graph
.
Nodes
();
...
...
@@ -461,11 +461,21 @@ void ControlFlowGraph::LiveVariableAnalysis() {
}
}
}
for
(
auto
*
op
:
ops_
)
{
unlived_vars_
[
op
]
=
std
::
set
<
std
::
string
>
();
for
(
auto
&
var
:
this
->
LiveIn
(
op
))
{
if
(
!
this
->
LiveOut
(
op
).
count
(
var
))
{
unlived_vars_
[
op
].
insert
(
var
);
}
}
}
}
void
ControlFlowGraph
::
RenameVarInCFGGraph
(
const
std
::
string
&
old_node
,
const
std
::
string
&
new_node
,
int
begin_idx
)
{
std
::
vector
<
bool
>
need_update
(
ops_
.
size
(),
false
);
// update graph from begin idx to the end
for
(
size_t
i
=
begin_idx
;
i
!=
ops_
.
size
();
++
i
)
{
auto
*
op
=
ops_
[
i
];
...
...
@@ -480,15 +490,27 @@ void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
if
(
live_in_
[
op
].
find
(
old_node
)
!=
live_in_
[
op
].
end
())
{
live_in_
[
op
].
erase
(
old_node
);
live_in_
[
op
].
insert
(
new_node
);
need_update
[
i
]
=
true
;
}
if
(
live_out_
[
op
].
find
(
old_node
)
!=
live_out_
[
op
].
end
())
{
live_out_
[
op
].
erase
(
old_node
);
live_out_
[
op
].
insert
(
new_node
);
need_update
[
i
]
=
true
;
}
}
for
(
size_t
i
=
begin_idx
;
i
<
ops_
.
size
();
++
i
)
{
if
(
!
need_update
[
i
])
continue
;
auto
*
op
=
ops_
[
i
];
for
(
auto
&
var
:
this
->
LiveIn
(
op
))
{
if
(
!
this
->
LiveOut
(
op
).
count
(
var
))
{
unlived_vars_
[
op
].
insert
(
var
);
}
}
}
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
LiveIn
(
ir
::
Node
*
op
)
const
{
const
std
::
set
<
std
::
string
>
&
ControlFlowGraph
::
LiveIn
(
ir
::
Node
*
op
)
const
{
auto
it
=
live_in_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
live_in_
.
end
(),
...
...
@@ -496,7 +518,7 @@ const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
return
it
->
second
;
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
LiveOut
(
ir
::
Node
*
op
)
const
{
const
std
::
set
<
std
::
string
>
&
ControlFlowGraph
::
LiveOut
(
ir
::
Node
*
op
)
const
{
auto
it
=
live_out_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
live_out_
.
end
(),
...
...
@@ -504,15 +526,24 @@ const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
return
it
->
second
;
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
Use
(
ir
::
Node
*
op
)
const
{
const
std
::
set
<
std
::
string
>
&
ControlFlowGraph
::
Use
(
ir
::
Node
*
op
)
const
{
auto
it
=
uses_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
uses_
.
end
(),
string
::
Sprintf
(
"Expect %s in live_out, but Not Found."
,
op
->
Name
()));
string
::
Sprintf
(
"Expect %s in use, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
}
const
std
::
set
<
std
::
string
>&
ControlFlowGraph
::
Unlived
(
ir
::
Node
*
op
)
const
{
auto
it
=
unlived_vars_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
unlived_vars_
.
end
(),
string
::
Sprintf
(
"Expect %s in unlived_set, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
return
it
->
second
;
}
const
std
::
vector
<
ir
::
Node
*>
ControlFlowGraph
::
Ops
()
const
{
return
ops_
;
}
const
std
::
vector
<
ir
::
Node
*>
&
ControlFlowGraph
::
Ops
()
const
{
return
ops_
;
}
std
::
vector
<
ir
::
Node
*>&
ControlFlowGraph
::
Ops
()
{
return
ops_
;
}
...
...
paddle/fluid/framework/details/memory_optimize_helper.h
浏览文件 @
c4187dbd
...
...
@@ -92,10 +92,11 @@ class ControlFlowGraph {
void
RenameVarInCFGGraph
(
const
std
::
string
&
old_node
,
const
std
::
string
&
new_node
,
int
begin_idx
);
const
std
::
set
<
std
::
string
>
LiveIn
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>
LiveOut
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>
Use
(
ir
::
Node
*
op
)
const
;
const
std
::
vector
<
ir
::
Node
*>
Ops
()
const
;
const
std
::
set
<
std
::
string
>&
LiveIn
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>&
LiveOut
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>&
Use
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>&
Unlived
(
ir
::
Node
*
op
)
const
;
const
std
::
vector
<
ir
::
Node
*>&
Ops
()
const
;
std
::
vector
<
ir
::
Node
*>&
Ops
();
// for ssa-graph nodes
...
...
@@ -117,6 +118,7 @@ class ControlFlowGraph {
VarSetMap
live_out_
;
VarSetMap
uses_
;
// op inputs
VarSetMap
defs_
;
// op outputs
std
::
unordered_map
<
ir
::
Node
*
,
std
::
set
<
std
::
string
>>
unlived_vars_
;
std
::
vector
<
ir
::
Node
*>
ops_
;
// op sequence by topology sort
};
...
...
paddle/fluid/framework/details/memory_optimize_helper_test.cc
浏览文件 @
c4187dbd
...
...
@@ -228,9 +228,6 @@ TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
ControlFlowGraph
cfg
(
graph
);
cfg
.
LiveVariableAnalysis
();
...
...
@@ -256,9 +253,6 @@ TEST(CFGGraph, IRGraph) {
TEST
(
SortOpLikeDescOrder
,
NormalTest
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
SortOpLikeDescOrder
(
graph
);
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
...
...
@@ -273,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) {
TEST
(
SortOpLikeDescOrder
,
RemoveOpDesc
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
graph
.
Nodes
();
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
ir
::
Node
*
found_node
=
nullptr
;
...
...
@@ -324,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) {
// 3. add some op_desc
TEST
(
SortOpLikeDescOrder
,
AddOpDesc
)
{
auto
prog
=
FillProgramDesc
();
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
ir
::
Graph
graph
(
prog
);
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
...
...
@@ -342,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
// cached desc different with real one
// mimic the intermidiete pass modify the programdesc.
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
std
::
vector
<
OpDesc
*>
op_descs
=
graph
.
OriginProgram
().
Block
(
0
).
AllOps
();
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
...
...
@@ -376,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
TEST
(
SortOpLikeDescOrder
,
AddAndDeleteOpDesc
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
...
...
@@ -392,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
return
ret
;
};
std
::
vector
<
OpDesc
*>
op_descs
=
graph
.
OriginProgram
().
Block
(
0
).
AllOps
();
// remove sum node
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
ir
::
Node
*
found_node
=
nullptr
;
auto
nodes
=
graph
.
Nodes
();
for
(
auto
node
:
nodes
)
{
...
...
@@ -454,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
TEST
(
SortOpLikeDescOrder
,
AddAndReplaceOpDescInplace
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
std
::
vector
<
OpDesc
*>
op_descs
=
graph
.
OriginProgram
().
Block
(
0
).
AllOps
();
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
...
...
@@ -470,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
return
ret
;
};
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
// add node
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
...
...
paddle/fluid/framework/details/memory_optimize_pass.cc
浏览文件 @
c4187dbd
...
...
@@ -118,8 +118,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
}
}
// fill the pool
for
(
auto
var
:
cfg_
->
LiveIn
(
op
))
{
if
(
cfg_
->
LiveOut
(
op
).
count
(
var
)
==
0
)
{
for
(
auto
&
var
:
cfg_
->
Unlived
(
op
))
{
ir
::
Node
*
var_node
=
cfg_
->
GetNodeByName
(
var
,
op
);
if
(
var_node
==
nullptr
||
var_node
->
IsCtrlVar
())
continue
;
if
(
NodeCanReused
(
var_node
)
&&
!
pool_
.
Has
(
var_node
))
{
...
...
@@ -127,7 +126,6 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
}
}
}
}
graph
->
ResolveHazard
(
var_nodes_
);
return
graph
;
...
...
@@ -337,4 +335,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
REGISTER_PASS
(
memory_optimize_pass
,
paddle
::
framework
::
details
::
MemoryOptimizePass
)
.
RequireGraphAttr
(
paddle
::
framework
::
details
::
k
All
OpDescs
);
.
RequireGraphAttr
(
paddle
::
framework
::
details
::
k
StaleProgram
OpDescs
);
paddle/fluid/framework/details/multi_devices_graph_pass.cc
浏览文件 @
c4187dbd
...
...
@@ -937,9 +937,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
}
void
DistSSAGraphBuilder
::
InsertPostprocessOps
(
ir
::
Graph
*
result
)
const
{
if
(
need_broadcast_var_
||
(
UseGPU
()
&&
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kReduce
))
{
// broad cast received parameters when training in parameter server mode.
if
(
need_broadcast_var_
)
{
// There are 4 conditions:
// 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS.
// Need to broadcast received parameters to other GPU.
// 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to
// broadcast received parameters to other GPU.
// 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to
// broadcast received parameters to other scope.
// 4. CPU && Reduce: because all parameters share the same memory, did not
// broadcast received parameters.
if
(
!
UseGPU
()
&&
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kReduce
)
{
return
;
}
if
(
strategy_
.
fuse_broadcast_op_
)
{
CreateFusedBroadcastOp
(
result
,
bcast_var_name_set_
);
}
else
{
...
...
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
浏览文件 @
c4187dbd
...
...
@@ -20,8 +20,7 @@ namespace framework {
namespace
details
{
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
ParallelSSAGraphExecutor
::
SeparateMultiDevicesGraph
(
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
)
{
ParallelSSAGraphExecutor
::
SeparateMultiDevicesGraph
(
ir
::
Graph
*
graph
)
{
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
graphs
;
graphs
.
reserve
(
places_
.
size
());
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
...
...
@@ -77,24 +76,18 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
ParallelSSAGraphExecutor
::
ParallelSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
framework
::
ProgramDesc
&
main_prog
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
)
const
std
::
vector
<
platform
::
Place
>
&
places
,
ir
::
Graph
*
graph
)
:
strategy_
(
std
::
move
(
strategy
)),
local_scopes_
(
std
::
move
(
local_scopes
)),
pool_
(
places
.
size
()
>=
2
?
new
::
ThreadPool
(
places
.
size
())
:
nullptr
),
places_
(
std
::
move
(
places
)),
main_prog_
(
main_prog
),
// TODO(Yancey1989): Copying graphs is not safely since it deleted the
// attrs.
graphs_
(
SeparateMultiDevicesGraph
(
std
::
move
(
graph
)
))
{
graphs_
(
SeparateMultiDevicesGraph
(
graph
))
{
PADDLE_ENFORCE_EQ
(
places_
.
size
(),
local_scopes_
.
size
());
auto
seq_allreduce_pass
=
ir
::
PassRegistry
::
Instance
().
Get
(
"all_reduce_deps_pass"
);
seq_allreduce_pass
->
Erase
(
details
::
kAllOpDescs
);
seq_allreduce_pass
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
details
::
kAllOpDescs
,
new
std
::
vector
<
OpDesc
*>
(
main_prog_
.
Block
(
0
).
AllOps
()));
for
(
size_t
i
=
0
;
i
<
graphs_
.
size
();
++
i
)
{
graphs_
[
i
]
=
seq_allreduce_pass
->
Apply
(
std
::
move
(
graphs_
[
i
]));
}
...
...
@@ -107,7 +100,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
<<
" to run the operators of the graph on each device."
;
for
(
size_t
i
=
0
;
i
<
places
.
size
();
++
i
)
{
executors_
.
emplace_back
(
new
details
::
ThreadedSSAGraphExecutor
(
strategy_
,
local_scopes_
,
{
places_
[
i
]},
std
::
move
(
graphs_
.
at
(
i
)
)));
strategy_
,
local_scopes_
,
{
places_
[
i
]},
graphs_
.
at
(
i
).
get
(
)));
}
}
...
...
paddle/fluid/framework/details/parallel_ssa_graph_executor.h
浏览文件 @
c4187dbd
...
...
@@ -31,8 +31,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
ParallelSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
framework
::
ProgramDesc
&
main_prog
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
ir
::
Graph
*
graph
);
~
ParallelSSAGraphExecutor
()
final
=
default
;
const
ir
::
Graph
&
Graph
()
const
override
{
return
*
graphs_
[
0
];
}
...
...
@@ -41,13 +40,12 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
private:
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
SeparateMultiDevicesGraph
(
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
ir
::
Graph
*
graph
);
ExecutionStrategy
strategy_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
unique_ptr
<::
ThreadPool
>
pool_
{
nullptr
};
std
::
vector
<
platform
::
Place
>
places_
;
framework
::
ProgramDesc
main_prog_
;
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
graphs_
;
std
::
vector
<
std
::
unique_ptr
<
details
::
ThreadedSSAGraphExecutor
>>
executors_
;
...
...
paddle/fluid/framework/details/sequential_execution_pass.cc
浏览文件 @
c4187dbd
...
...
@@ -40,7 +40,7 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
static
std
::
unordered_set
<
std
::
string
>
skip_dist_ops
{
"send"
,
"recv"
,
"send_barrier"
,
"fetch_barrier"
};
auto
&
ops
=
Get
<
const
std
::
vector
<
OpDesc
*>>
(
kAll
OpDescs
);
auto
&
ops
=
graph
->
Get
<
const
std
::
vector
<
OpDesc
*>>
(
kStaleProgram
OpDescs
);
std
::
vector
<
ir
::
Node
*>
op_node_list
;
op_node_list
.
reserve
(
ops
.
size
());
...
...
@@ -107,4 +107,4 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
REGISTER_PASS
(
sequential_execution_pass
,
paddle
::
framework
::
details
::
SequentialExecutionPass
)
.
Require
PassAttr
(
paddle
::
framework
::
details
::
kAll
OpDescs
);
.
Require
GraphAttr
(
paddle
::
framework
::
details
::
kStaleProgram
OpDescs
);
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
c4187dbd
...
...
@@ -23,9 +23,8 @@ namespace framework {
namespace
details
{
ThreadedSSAGraphExecutor
::
ThreadedSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
)
:
graph_
(
std
::
move
(
graph
)),
const
std
::
vector
<
platform
::
Place
>
&
places
,
ir
::
Graph
*
graph
)
:
graph_
(
graph
),
pool_
(
strategy
.
num_threads_
>=
2
?
new
::
ThreadPool
(
strategy
.
num_threads_
)
:
nullptr
),
local_scopes_
(
local_scopes
),
...
...
@@ -110,7 +109,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for
(
auto
&
run_op_future
:
run_op_futures_
)
{
run_op_future
.
wait
();
}
ClearFetchOp
(
graph_
.
get
()
,
&
fetch_ops
);
ClearFetchOp
(
graph_
,
&
fetch_ops
);
exception_holder_
.
ReThrow
();
}
else
{
continue
;
...
...
@@ -135,7 +134,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
PADDLE_ENFORCE
(
ready_ops
.
empty
());
// Wait FetchOps.
ClearFetchOp
(
graph_
.
get
()
,
&
fetch_ops
);
ClearFetchOp
(
graph_
,
&
fetch_ops
);
return
fetch_data
;
}
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
浏览文件 @
c4187dbd
...
...
@@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
ir
::
Graph
*
graph
);
const
ir
::
Graph
&
Graph
()
const
override
{
return
*
graph_
;
}
// Run a SSAGraph by a thread pool
...
...
@@ -55,7 +55,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details
::
OpHandleBase
*
op
);
private:
std
::
unique_ptr
<
ir
::
Graph
>
graph_
;
ir
::
Graph
*
graph_
;
std
::
unique_ptr
<::
ThreadPool
>
pool_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
c4187dbd
...
...
@@ -102,6 +102,8 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE
cc_test
(
test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass
)
if
(
WITH_MKLDNN
)
cc_test
(
test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass
)
cc_test
(
test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor
)
cc_test
(
test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass
)
cc_test
(
test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass
)
cc_test
(
test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass
)
endif
()
paddle/fluid/framework/ir/graph.cc
浏览文件 @
c4187dbd
...
...
@@ -76,6 +76,9 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
var
->
inputs
.
push_back
(
node
);
}
}
Set
<
const
std
::
vector
<
OpDesc
*>>
(
details
::
kStaleProgramOpDescs
,
new
std
::
vector
<
OpDesc
*>
(
program
.
Block
(
0
).
AllOps
()));
return
var_nodes
;
}
...
...
paddle/fluid/framework/ir/graph.h
浏览文件 @
c4187dbd
...
...
@@ -31,7 +31,7 @@ namespace details {
// This attr is not recommended, because the graph should not dependence
// the program once it is built.
constexpr
char
k
AllOpDescs
[]
=
"all
_op_descs"
;
constexpr
char
k
StaleProgramOpDescs
[]
=
"stale_program
_op_descs"
;
}
// namespace details
namespace
ir
{
...
...
@@ -195,6 +195,12 @@ class Graph {
return
nullptr
;
}
// Returns reference to the original program.
// WARN: After a series of passes, the current graph can be quite
// different from OriginProgram. Caller shouldn't assume much from
// the returned OriginProgram.
const
ProgramDesc
&
OriginProgram
()
const
{
return
program_
;
}
// This method takes ownership of `node`.
ir
::
Node
*
AddNode
(
ir
::
Node
*
node
)
{
PADDLE_ENFORCE
(
node_set_
.
find
(
node
)
==
node_set_
.
end
());
...
...
paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
0 → 100644
浏览文件 @
c4187dbd
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
void
SetOp
(
ProgramDesc
*
prog
,
const
std
::
string
&
type
,
const
std
::
string
&
name
,
const
std
::
vector
<
std
::
string
>&
inputs
,
const
std
::
vector
<
std
::
string
>&
outputs
)
{
auto
*
op
=
prog
->
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
type
);
if
(
type
==
"conv2d"
)
{
op
->
SetAttr
(
"use_mkldnn"
,
true
);
op
->
SetAttr
(
"name"
,
name
);
op
->
SetInput
(
"Input"
,
{
inputs
[
0
]});
op
->
SetInput
(
"Filter"
,
{
inputs
[
1
]});
if
(
inputs
.
size
()
>
2
)
op
->
SetInput
(
"Bias"
,
{
inputs
[
2
]});
else
op
->
SetInput
(
"Bias"
,
{});
}
else
if
(
type
==
"elementwise_add"
)
{
op
->
SetAttr
(
"use_mkldnn"
,
true
);
op
->
SetInput
(
"X"
,
{
inputs
[
0
]});
op
->
SetInput
(
"Y"
,
{
inputs
[
1
]});
}
op
->
SetOutput
(
"Out"
,
outputs
);
op
->
SetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
(),
static_cast
<
int
>
(
OpRole
::
kForward
));
}
// (c, weights)->conv->f
// (f)->elementwise_add->g
ProgramDesc
BuildProgramDesc
(
bool
convWithExistingBias
)
{
ProgramDesc
prog
;
std
::
vector
<
std
::
string
>
nodes
{
"c"
,
"weights"
,
"f"
,
"eltwise_bias"
,
"g"
};
if
(
convWithExistingBias
)
nodes
.
push_back
(
"conv_bias"
);
for
(
auto
&
v
:
nodes
)
{
auto
*
var
=
prog
.
MutableBlock
(
0
)
->
Var
(
v
);
var
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
if
(
v
==
"weights"
||
v
==
"conv_bias"
||
v
==
"eltwise_bias"
)
{
var
->
SetPersistable
(
true
);
}
}
// conv+bias, both with MKL-DNN
if
(
convWithExistingBias
)
{
SetOp
(
&
prog
,
"conv2d"
,
"conv"
,
std
::
vector
<
std
::
string
>
({
"c"
,
"weights"
,
"conv_bias"
}),
std
::
vector
<
std
::
string
>
({
"f"
}));
}
else
{
SetOp
(
&
prog
,
"conv2d"
,
"conv"
,
std
::
vector
<
std
::
string
>
({
"c"
,
"weights"
}),
std
::
vector
<
std
::
string
>
({
"f"
}));
}
SetOp
(
&
prog
,
"elementwise_add"
,
"eltwise"
,
std
::
vector
<
std
::
string
>
({
"f"
,
"eltwise_bias"
}),
std
::
vector
<
std
::
string
>
({
"g"
}));
return
prog
;
}
void
InitTensorHolder
(
Scope
*
scope
,
const
paddle
::
platform
::
Place
&
place
,
const
char
*
var_name
)
{
auto
x
=
scope
->
Var
(
var_name
);
auto
tensor
=
x
->
GetMutable
<
LoDTensor
>
();
tensor
->
mutable_data
(
place
,
proto
::
VarType
::
FP32
,
::
paddle
::
memory
::
Allocator
::
kDefault
,
1
);
}
void
MainTest
(
bool
convWithExistingBias
)
{
auto
prog
=
BuildProgramDesc
(
convWithExistingBias
);
std
::
unique_ptr
<
ir
::
Graph
>
graph
(
new
ir
::
Graph
(
prog
));
auto
place
=
paddle
::
platform
::
CPUPlace
();
NaiveExecutor
exe
{
place
};
Scope
scope
;
// Init scope, as it is used in pass
exe
.
CreateVariables
(
prog
,
0
,
true
,
&
scope
);
if
(
convWithExistingBias
)
{
InitTensorHolder
(
&
scope
,
place
,
"conv_bias"
);
InitTensorHolder
(
&
scope
,
place
,
"eltwise_bias"
);
}
graph
->
Set
(
kParamScopeAttr
,
new
framework
::
Scope
*
(
&
scope
));
auto
pass
=
PassRegistry
::
Instance
().
Get
(
"conv_bias_mkldnn_fuse_pass"
);
int
original_nodes_num
=
graph
->
Nodes
().
size
();
graph
=
pass
->
Apply
(
std
::
move
(
graph
));
int
current_nodes_num
=
graph
->
Nodes
().
size
();
// Remove 3 Nodes: Conv, Bias, conv_out
// Add 1 Node: ConvBias
EXPECT_EQ
(
original_nodes_num
-
2
,
current_nodes_num
);
// Assert conv_bias op in newly generated graph
int
conv_bias_count
=
0
;
for
(
auto
*
node
:
graph
->
Nodes
())
{
if
(
node
->
IsOp
()
&&
node
->
Op
()
->
Type
()
==
"conv2d"
)
{
auto
*
op
=
node
->
Op
();
ASSERT_TRUE
(
op
->
HasAttr
(
"use_mkldnn"
));
EXPECT_TRUE
(
boost
::
get
<
bool
>
(
op
->
GetAttr
(
"use_mkldnn"
)));
// check if "conv" convolution is fused
auto
op_name
=
boost
::
get
<
std
::
string
>
(
op
->
GetAttr
(
"name"
));
if
(
op_name
==
"conv"
)
{
auto
input_names
=
op
->
InputNames
();
ASSERT_TRUE
(
std
::
find
(
input_names
.
begin
(),
input_names
.
end
(),
"Bias"
)
!=
input_names
.
end
());
auto
bias
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
op
->
Input
(
"Bias"
));
if
(
bias
.
size
())
{
++
conv_bias_count
;
}
}
}
}
EXPECT_EQ
(
conv_bias_count
,
1
);
}
TEST
(
ConvBiasFusePass
,
bias_free_conv
)
{
MainTest
(
false
);
}
TEST
(
ConvBiasFusePass
,
conv_with_existing_bias
)
{
MainTest
(
true
);
}
TEST
(
ConvBiasFusePass
,
conv3d
)
{
Conv3DBiasFusePass
pass
;
ASSERT_TRUE
(
pass
.
is_conv3d
());
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
USE_PASS
(
conv_bias_mkldnn_fuse_pass
);
paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
浏览文件 @
c4187dbd
...
...
@@ -44,10 +44,14 @@ struct TestIsReachable {
using
func
=
std
::
function
<
bool
(
const
std
::
string
&
,
const
std
::
string
&
)
>
;
auto
operator
()(
const
std
::
unique_ptr
<
ir
::
Graph
>&
graph
)
->
func
{
auto
find_node
=
[](
const
std
::
unique_ptr
<
ir
::
Graph
>&
graph
,
auto
hash
=
[](
const
Node
*
node
)
->
std
::
string
{
return
node
->
Name
()
+
std
::
to_string
(
node
->
id
());
};
auto
find_node
=
[
&
](
const
std
::
unique_ptr
<
ir
::
Graph
>&
graph
,
const
std
::
string
&
name
)
->
Node
*
{
for
(
auto
&
node
:
GraphTraits
::
DFS
(
*
graph
))
{
if
(
name
==
node
.
Name
(
))
{
if
(
name
==
hash
(
&
node
))
{
return
&
node
;
}
}
...
...
@@ -55,13 +59,17 @@ struct TestIsReachable {
return
nullptr
;
};
return
[
&
](
std
::
string
from
,
const
std
::
string
to
)
->
bool
{
// update the from and to strings to hashed equivs in loop from graph traits
return
[
&
](
std
::
string
from
,
std
::
string
to
)
->
bool
{
if
(
from
==
to
)
return
true
;
std
::
map
<
std
::
string
,
bool
>
visited
;
for
(
auto
&
node
:
GraphTraits
::
DFS
(
*
graph
))
{
visited
[
node
.
Name
()]
=
false
;
auto
hashed
=
hash
(
&
node
);
if
(
node
.
Name
()
==
from
)
from
=
hashed
;
if
(
node
.
Name
()
==
to
)
to
=
hashed
;
visited
[
hashed
]
=
false
;
}
visited
[
from
]
=
true
;
...
...
@@ -72,15 +80,15 @@ struct TestIsReachable {
while
(
!
queue
.
empty
())
{
auto
cur
=
find_node
(
graph
,
queue
.
front
());
queue
.
pop_front
();
if
(
cur
==
nullptr
)
return
false
;
for
(
auto
n
:
cur
->
outputs
)
{
if
(
n
->
Name
()
==
to
)
return
true
;
auto
hashed_name
=
hash
(
n
);
if
(
hashed_name
==
to
)
return
true
;
if
(
!
visited
[
n
->
Name
()
])
{
visited
[
n
->
Name
()
]
=
true
;
queue
.
push_back
(
n
->
Name
()
);
if
(
!
visited
[
hashed_name
])
{
visited
[
hashed_name
]
=
true
;
queue
.
push_back
(
hashed_name
);
}
}
}
...
...
@@ -166,6 +174,28 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
RunPassAndAssert
(
&
prog
,
"a"
,
"relu"
,
1
);
}
TEST
(
ConvElementwiseAddMKLDNNFusePass
,
ConvolutionProjectionAsYWithElementwiseAddRelu
)
{
auto
prog
=
BuildProgramDesc
({
"a"
,
"b"
,
"c"
,
"d"
,
"e"
,
"f"
},
{
"bias"
,
"weights"
,
"bias2"
,
"weights2"
});
SetOp
(
&
prog
,
"sigmoid"
,
{{
"X"
,
"a"
}},
{
"Out"
,
"b"
});
// right branch
SetOp
(
&
prog
,
"conv2d"
,
{{
"Input"
,
"b"
},
{
"Bias"
,
"bias"
},
{
"Filter"
,
"weights"
}},
{
"Output"
,
"c"
});
// left branch
SetOp
(
&
prog
,
"conv2d"
,
{{
"Input"
,
"a"
},
{
"Bias"
,
"bias2"
},
{
"Filter"
,
"weights2"
}},
{
"Output"
,
"f"
});
SetOp
(
&
prog
,
"elementwise_add"
,
{{
"X"
,
"f"
},
{
"Y"
,
"c"
}},
{
"Out"
,
"d"
});
SetOp
(
&
prog
,
"relu"
,
{{
"X"
,
"d"
}},
{
"Out"
,
"e"
});
RunPassAndAssert
(
&
prog
,
"a"
,
"relu"
,
2
);
}
TEST
(
ConvElementwiseAddMKLDNNFusePass
,
ConvolutionAsYWithElementwiseAddReluNoBias
)
{
auto
prog
=
BuildProgramDesc
({
"a"
,
"b"
,
"c"
,
"d"
,
"e"
},
{
"weights"
});
...
...
paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
浏览文件 @
c4187dbd
...
...
@@ -21,7 +21,7 @@ namespace ir {
std
::
unique_ptr
<
ir
::
Graph
>
MKLDNNPlacementPass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
VLOG
(
3
)
<<
"Aplies MKL-DNN placement strategy."
;
VLOG
(
3
)
<<
"Ap
p
lies MKL-DNN placement strategy."
;
const
auto
&
op_types_list
=
Get
<
std
::
unordered_set
<
std
::
string
>>
(
"mkldnn_enabled_op_types"
);
for
(
const
Node
*
n
:
graph
->
Nodes
())
{
...
...
paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
0 → 100644
浏览文件 @
c4187dbd
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
#include <gtest/gtest.h>
#include <boost/logic/tribool.hpp>
namespace
paddle
{
namespace
framework
{
namespace
ir
{
void
SetOp
(
ProgramDesc
*
prog
,
const
std
::
string
&
type
,
const
std
::
string
&
name
,
const
std
::
vector
<
std
::
string
>&
inputs
,
const
std
::
vector
<
std
::
string
>&
outputs
,
boost
::
tribool
use_mkldnn
)
{
auto
*
op
=
prog
->
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
type
);
if
(
!
boost
::
indeterminate
(
use_mkldnn
))
op
->
SetAttr
(
"use_mkldnn"
,
use_mkldnn
);
if
(
type
==
"conv2d"
)
{
op
->
SetAttr
(
"name"
,
name
);
op
->
SetInput
(
"Input"
,
{
inputs
[
0
]});
op
->
SetInput
(
"Filter"
,
{
inputs
[
1
]});
op
->
SetInput
(
"Bias"
,
{
inputs
[
2
]});
}
else
if
(
type
==
"relu"
)
{
op
->
SetInput
(
"X"
,
inputs
);
}
else
if
(
type
==
"concat"
)
{
op
->
SetAttr
(
"axis"
,
1
);
op
->
SetInput
(
"X"
,
{
inputs
[
0
],
inputs
[
1
]});
}
else
if
(
type
==
"pool2d"
)
{
op
->
SetInput
(
"X"
,
{
inputs
[
0
]});
}
else
{
FAIL
()
<<
"Unexpected operator type."
;
}
op
->
SetOutput
(
"Out"
,
{
outputs
[
0
]});
}
// operator use_mkldnn
// ---------------------------------------
// (a,b)->concat->c none
// (c,weights,bias)->conv->f none
// f->relu->g false
// g->pool->h false
// (h,weights2,bias2)->conv->k true
// k->relu->l true
ProgramDesc
BuildProgramDesc
()
{
ProgramDesc
prog
;
for
(
auto
&
v
:
std
::
vector
<
std
::
string
>
({
"a"
,
"b"
,
"c"
,
"weights"
,
"bias"
,
"f"
,
"g"
,
"h"
,
"weights2"
,
"bias2"
,
"k"
,
"l"
}))
{
auto
*
var
=
prog
.
MutableBlock
(
0
)
->
Var
(
v
);
var
->
SetType
(
proto
::
VarType
::
SELECTED_ROWS
);
if
(
v
==
"weights"
||
v
==
"bias"
)
{
var
->
SetPersistable
(
true
);
}
}
SetOp
(
&
prog
,
"concat"
,
"concat1"
,
std
::
vector
<
std
::
string
>
({
"a"
,
"b"
}),
std
::
vector
<
std
::
string
>
({
"c"
}),
boost
::
indeterminate
);
SetOp
(
&
prog
,
"conv2d"
,
"conv1"
,
std
::
vector
<
std
::
string
>
({
"c"
,
"weights"
,
"bias"
}),
std
::
vector
<
std
::
string
>
({
"f"
}),
boost
::
indeterminate
);
SetOp
(
&
prog
,
"relu"
,
"relu1"
,
std
::
vector
<
std
::
string
>
({
"f"
}),
std
::
vector
<
std
::
string
>
({
"g"
}),
false
);
SetOp
(
&
prog
,
"pool2d"
,
"pool1"
,
std
::
vector
<
std
::
string
>
({
"g"
}),
std
::
vector
<
std
::
string
>
({
"h"
}),
false
);
SetOp
(
&
prog
,
"conv2d"
,
"conv2"
,
std
::
vector
<
std
::
string
>
({
"h"
,
"weights2"
,
"bias2"
}),
std
::
vector
<
std
::
string
>
({
"k"
}),
true
);
SetOp
(
&
prog
,
"relu"
,
"relu2"
,
std
::
vector
<
std
::
string
>
({
"k"
}),
std
::
vector
<
std
::
string
>
({
"l"
}),
true
);
return
prog
;
}
void
MainTest
(
std
::
initializer_list
<
std
::
string
>
mkldnn_enabled_op_types
,
unsigned
expected_use_mkldnn_true_count
)
{
auto
prog
=
BuildProgramDesc
();
std
::
unique_ptr
<
ir
::
Graph
>
graph
(
new
ir
::
Graph
(
prog
));
auto
pass
=
PassRegistry
::
Instance
().
Get
(
"mkldnn_placement_pass"
);
pass
->
Set
(
"mkldnn_enabled_op_types"
,
new
std
::
unordered_set
<
std
::
string
>
(
mkldnn_enabled_op_types
));
graph
=
pass
->
Apply
(
std
::
move
(
graph
));
unsigned
use_mkldnn_true_count
=
0
;
for
(
auto
*
node
:
graph
->
Nodes
())
{
if
(
node
->
IsOp
())
{
auto
*
op
=
node
->
Op
();
if
(
op
->
HasAttr
(
"use_mkldnn"
)
&&
boost
::
get
<
bool
>
(
op
->
GetAttr
(
"use_mkldnn"
)))
{
++
use_mkldnn_true_count
;
}
}
}
EXPECT_EQ
(
use_mkldnn_true_count
,
expected_use_mkldnn_true_count
);
}
TEST
(
MKLDNNPlacementPass
,
enable_conv_relu
)
{
// 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 0 pool
MainTest
({
"conv2d"
,
"relu"
},
3
);
}
TEST
(
MKLDNNPlacementPass
,
enable_relu_pool
)
{
// 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool
MainTest
({
"relu"
,
"pool2d"
},
4
);
}
TEST
(
MKLDNNPlacementPass
,
enable_all
)
{
// 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool
MainTest
({},
4
);
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
USE_PASS
(
mkldnn_placement_pass
);
paddle/fluid/framework/op_registry.h
浏览文件 @
c4187dbd
...
...
@@ -290,7 +290,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
"USE_OP_DEVICE_KERNEL must be in global namespace"); \
extern int \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##
DEFAULT_TYPE
##_ =
/* NOLINT */
\
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##
customized_name
##_ =
/* NOLINT */
\
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
c4187dbd
...
...
@@ -904,6 +904,16 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
this
->
InferShape
(
&
infer_shape_ctx
);
}
std
::
vector
<
KernelConfig
>*
OperatorWithKernel
::
GetKernelConfig
(
const
OpKernelType
&
key
)
const
{
auto
config_iter
=
kernel_configs_map_
.
find
(
key
);
std
::
vector
<
KernelConfig
>*
kernel_configs
=
nullptr
;
if
(
config_iter
!=
kernel_configs_map_
.
end
())
{
kernel_configs
=
&
(
config_iter
->
second
);
}
return
kernel_configs
;
}
void
OperatorWithKernel
::
RunImpl
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{
RuntimeContext
ctx
(
Inputs
(),
Outputs
(),
scope
);
...
...
@@ -921,7 +931,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
OpKernelMap
&
kernels
=
kernels_iter
->
second
;
auto
expected_kernel_key
=
this
->
GetExpectedKernelType
(
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
));
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
,
nullptr
));
VLOG
(
3
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
auto
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
...
...
@@ -940,6 +950,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
KernelTypeToString
(
expected_kernel_key
));
}
std
::
vector
<
KernelConfig
>*
kernel_configs
=
GetKernelConfig
(
expected_kernel_key
);
// do data transformScope &transfer_scope;
std
::
vector
<
std
::
string
>
transfered_inplace_vars
;
auto
*
transfer_scope
=
...
...
@@ -957,7 +970,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
this
->
InferShape
(
&
infer_shape_ctx
);
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter
->
second
(
ExecutionContext
(
*
this
,
exec_scope
,
*
dev_ctx
,
ctx
));
kernel_iter
->
second
(
ExecutionContext
(
*
this
,
exec_scope
,
*
dev_ctx
,
ctx
,
kernel_configs
));
if
(
!
transfered_inplace_vars
.
empty
())
{
// there is inplace variable has been transfered.
...
...
paddle/fluid/framework/operator.h
浏览文件 @
c4187dbd
...
...
@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
...
...
@@ -184,12 +185,30 @@ class OperatorBase {
const
platform
::
Place
&
place
)
const
=
0
;
};
#ifdef PADDLE_WITH_CUDA
using
KernelConfig
=
boost
::
variant
<
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
,
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
,
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>>
;
#else
using
KernelConfig
=
boost
::
variant
<
boost
::
blank
>
;
#endif
using
OpKernelConfigsMap
=
std
::
unordered_map
<
OpKernelType
,
std
::
vector
<
KernelConfig
>
,
OpKernelType
::
Hash
>
;
class
ExecutionContext
{
public:
ExecutionContext
(
const
OperatorBase
&
op
,
const
Scope
&
scope
,
const
platform
::
DeviceContext
&
device_context
,
const
RuntimeContext
&
ctx
)
:
op_
(
op
),
scope_
(
scope
),
device_context_
(
device_context
),
ctx_
(
ctx
)
{}
const
RuntimeContext
&
ctx
,
std
::
vector
<
KernelConfig
>*
configs
)
:
op_
(
op
),
scope_
(
scope
),
device_context_
(
device_context
),
ctx_
(
ctx
),
kernel_configs_
(
configs
)
{}
const
OperatorBase
&
op
()
const
{
return
op_
;
}
...
...
@@ -398,11 +417,20 @@ class ExecutionContext {
return
temp_tensor
;
}
template
<
typename
T
>
T
&
GetKernelConfig
(
int
idx
)
const
{
PADDLE_ENFORCE
(
kernel_configs_
&&
kernel_configs_
->
size
()
>
idx
,
"%s selected kernel doesn't have kernel config %lu <= %d"
,
op_
.
Type
().
c_str
(),
kernel_configs_
->
size
(),
idx
);
return
*
boost
::
get
<
std
::
shared_ptr
<
T
>>
(
kernel_configs_
->
at
(
idx
));
}
private:
const
OperatorBase
&
op_
;
const
Scope
&
scope_
;
const
platform
::
DeviceContext
&
device_context_
;
const
RuntimeContext
&
ctx_
;
mutable
std
::
vector
<
KernelConfig
>*
kernel_configs_
;
};
template
<>
...
...
@@ -483,6 +511,8 @@ class OperatorWithKernel : public OperatorBase {
virtual
OpKernelType
GetExpectedKernelType
(
const
ExecutionContext
&
ctx
)
const
;
std
::
vector
<
KernelConfig
>*
GetKernelConfig
(
const
OpKernelType
&
key
)
const
;
protected:
virtual
OpKernelType
GetKernelTypeForVar
(
const
std
::
string
&
var_name
,
const
Tensor
&
tensor
,
...
...
@@ -508,6 +538,9 @@ class OperatorWithKernel : public OperatorBase {
void
TransferInplaceVarsBack
(
const
Scope
&
scope
,
const
std
::
vector
<
std
::
string
>&
inplace_vars
,
const
Scope
&
exec_scope
)
const
;
protected:
mutable
OpKernelConfigsMap
kernel_configs_map_
;
};
extern
bool
OpSupportGPU
(
const
std
::
string
&
op_type
);
...
...
paddle/fluid/framework/operator_kernel_configs.h
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <unordered_map>
#include <vector>
namespace
paddle
{
namespace
framework
{
// Not thread-safe. Should be owned per-kernel.
template
<
typename
TAlgorithm
>
class
AlgorithmsCache
{
public:
AlgorithmsCache
()
:
search_times_
(
0
)
{
hash_
.
clear
();
}
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
// can set for different data type
std
::
function
<
TAlgorithm
()
>
gen_func
);
TAlgorithm
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
);
private:
std
::
unordered_map
<
int64_t
,
TAlgorithm
>
hash_
;
int
search_times_
;
};
template
<
typename
TAlgorithm
>
TAlgorithm
framework
::
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
int64_t
seed
=
0
;
// Hash all of the inputs, use to try and look up a previously
// discovered algorithm, or fall back to generating a new one.
std
::
hash
<
int64_t
>
hashFn
;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for
(
const
auto
num
:
dims1
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
for
(
const
auto
num
:
dims2
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
1
;
}
for
(
const
auto
num
:
strides
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
2
;
}
for
(
const
auto
num
:
paddings
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
3
;
}
for
(
const
auto
num
:
dilations
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
4
;
}
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
algorithmFlags
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
5
;
if
(
seed
==
0
)
return
gen_func
();
if
(
hash_
.
find
(
seed
)
==
hash_
.
end
())
{
TAlgorithm
value
=
gen_func
();
hash_
[
seed
]
=
value
;
}
return
hash_
[
seed
];
}
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
if
(
hash_
.
find
(
area
)
!=
hash_
.
end
())
{
return
hash_
[
area
];
}
if
(
search_times_
<
search_times
)
{
auto
algo
=
gen_func
();
hash_
[
area
]
=
algo
;
++
search_times_
;
return
algo
;
}
TAlgorithm
algo
;
int64_t
min
=
static_cast
<
uint64_t
>
(
INT_MAX
);
for
(
const
auto
&
m
:
hash_
)
{
if
(
m
.
first
<
min
)
{
min
=
m
.
first
;
algo
=
m
.
second
;
}
}
return
algo
;
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
c4187dbd
...
...
@@ -184,9 +184,10 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
ParallelExecutor
::
ParallelExecutor
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
unordered_set
<
std
::
string
>
&
bcast_vars
,
const
ProgramDesc
&
main_program
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
)
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
,
ir
::
Graph
*
graph
)
:
member_
(
new
ParallelExecutorPrivate
(
places
))
{
member_
->
global_scope_
=
scope
;
member_
->
use_cuda_
=
exec_strategy
.
use_cuda_
;
...
...
@@ -216,11 +217,13 @@ ParallelExecutor::ParallelExecutor(
}
}
std
::
unique_ptr
<
ir
::
Graph
>
temp_owned_graph
(
graph
);
// FIXME(Yancey1989): parallel graph mode get better performance
// in GPU allreduce distributed training. Need an elegant way to
// choice the execution strategy.
build_strategy
.
enable_parallel_graph_
=
EnableParallelGraphExecution
(
main_program
,
exec_strategy
,
build_strategy
);
build_strategy
.
enable_parallel_graph_
=
EnableParallelGraphExecution
(
*
temp_owned_graph
,
exec_strategy
,
build_strategy
);
if
(
build_strategy
.
enable_parallel_graph_
)
VLOG
(
0
)
<<
"The Executor would execute the graph by ParallelGraph "
"Execution which can get better performance,"
...
...
@@ -254,26 +257,32 @@ ParallelExecutor::ParallelExecutor(
if
(
member_
->
local_scopes_
.
size
()
!=
1
&&
local_scopes
.
empty
())
{
BCastParamsToDevices
(
bcast_vars
);
}
// Startup Program has been run. All local scopes has correct parameters.
// Startup Program has been run. All local scopes has correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
std
::
unique_ptr
<
ir
::
Graph
>
graph
;
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
graph
=
build_strategy
.
Apply
(
main_program
,
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
());
temp_owned_graph
=
build_strategy
.
Apply
(
std
::
move
(
temp_owned_graph
),
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
());
#else
graph
=
build_strategy
.
Apply
(
main_program
,
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
);
temp_owned_graph
=
build_strategy
.
Apply
(
std
::
move
(
temp_owned_graph
),
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
);
#endif
auto
max_memory_size
=
GetEagerDeletionThreshold
();
VLOG
(
10
)
<<
"Eager Deletion Threshold "
<<
static_cast
<
float
>
(
max_memory_size
)
/
(
1
<<
30
);
if
(
max_memory_size
>=
0
)
{
graph
=
member_
->
PrepareGCAndRefCnts
(
std
::
move
(
graph
),
static_cast
<
size_t
>
(
max_memory_size
));
graph
=
member_
->
PrepareGCAndRefCnts
(
std
::
move
(
temp_owned_graph
),
static_cast
<
size_t
>
(
max_memory_size
))
.
release
();
}
else
{
graph
=
temp_owned_graph
.
release
();
}
// Step 3. Create vars in each scope. Passes may also create new vars.
...
...
@@ -308,8 +317,7 @@ ParallelExecutor::ParallelExecutor(
// TODO(Yancey1989): Remove passing in the main_program when
// allreduce_seq_pass doesn't need it as the attr.
member_
->
executor_
.
reset
(
new
details
::
ParallelSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
main_program
,
std
::
move
(
graph
)));
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
graph
));
#else
PADDLE_THROW
(
"Paddle should be compiled with CUDA for ParallelGraph Execution."
);
...
...
@@ -317,12 +325,10 @@ ParallelExecutor::ParallelExecutor(
}
else
{
if
(
exec_strategy
.
type_
==
ExecutionStrategy
::
kDefault
)
{
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
std
::
move
(
graph
)));
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
graph
));
}
else
{
member_
->
executor_
.
reset
(
new
details
::
FastThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
std
::
move
(
graph
)));
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
graph
));
}
}
...
...
@@ -452,26 +458,35 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
}
ParallelExecutor
::~
ParallelExecutor
()
{
for
(
auto
&
p
:
member_
->
places_
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
}
delete
member_
;
}
bool
ParallelExecutor
::
EnableParallelGraphExecution
(
const
ProgramDesc
&
main_program
,
const
ExecutionStrategy
&
exec_strategy
,
const
ir
::
Graph
&
graph
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
)
const
{
if
(
!
FLAGS_enable_parallel_graph
)
return
false
;
bool
enable_parallel_graph
=
true
;
for
(
ir
::
Node
*
node
:
graph
.
Nodes
())
{
if
(
node
->
IsVar
()
&&
node
->
Var
())
{
// TODO(Yancey1989): support sparse update in ParallelGraph mode.
for
(
auto
&
var_desc
:
main_program
.
Block
(
0
).
AllVars
())
{
if
(
var_desc
->
GetType
()
==
proto
::
VarType
::
SELECTED_ROWS
)
{
if
(
node
->
Var
()
->
GetType
()
==
proto
::
VarType
::
SELECTED_ROWS
)
{
enable_parallel_graph
=
false
;
break
;
}
}
}
else
if
(
node
->
IsOp
()
&&
node
->
Op
())
{
// TODO(Yancey1989): support pserver mode
for
(
auto
&
op_desc
:
main_program
.
Block
(
0
).
AllOps
())
{
if
(
op_desc
->
Type
()
==
"send"
||
op_desc
->
Type
()
==
"recv"
)
{
if
(
node
->
Op
()
->
Type
()
==
"send"
||
node
->
Op
()
->
Type
()
==
"recv"
)
{
enable_parallel_graph
=
false
;
break
;
}
}
}
if
(
!
member_
->
use_all_reduce_
||
!
member_
->
use_cuda_
)
...
...
@@ -481,13 +496,6 @@ bool ParallelExecutor::EnableParallelGraphExecution(
return
enable_parallel_graph
;
}
ParallelExecutor
::~
ParallelExecutor
()
{
for
(
auto
&
p
:
member_
->
places_
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
}
delete
member_
;
}
}
// namespace framework
}
// namespace paddle
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
c4187dbd
...
...
@@ -46,11 +46,11 @@ class ParallelExecutor {
public:
explicit
ParallelExecutor
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
unordered_set
<
std
::
string
>
&
bcast_vars
,
const
ProgramDesc
&
main_program
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
);
const
BuildStrategy
&
build_strategy
,
ir
::
Graph
*
graph
);
~
ParallelExecutor
();
...
...
@@ -71,7 +71,7 @@ class ParallelExecutor {
private:
void
BCastParamsToDevices
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
;
bool
EnableParallelGraphExecution
(
const
ProgramDesc
&
main_program
,
bool
EnableParallelGraphExecution
(
const
ir
::
Graph
&
graph
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
)
const
;
...
...
paddle/fluid/framework/tensor.h
浏览文件 @
c4187dbd
...
...
@@ -27,6 +27,10 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_utils.h"
#endif
namespace
paddle
{
namespace
framework
{
...
...
@@ -37,10 +41,34 @@ class Tensor {
#ifdef PADDLE_WITH_MKLDNN
public:
inline
mkldnn
::
memory
::
format
format
()
const
{
return
format_
;
}
// TODO(jczaja): This is depracted and will be removed
inline
mkldnn
::
memory
::
format
format
()
const
{
if
(
layout_
==
DataLayout
::
kMKLDNN
)
{
return
static_cast
<
mkldnn
::
memory
::
format
>
(
mem_pd_
.
desc
().
data
.
format
);
}
else
{
return
mkldnn
::
memory
::
format
::
format_undef
;
}
}
inline
void
set_format
(
const
mkldnn
::
memory
::
format
format
)
{
format_
=
format
;
// TODO(jczaja): This is depracted and will be removed
inline
void
set_format
(
const
mkldnn
::
memory
::
format
fmt
,
mkldnn
::
memory
::
data_type
data_type
=
mkldnn
::
memory
::
f32
)
{
mem_pd_
=
paddle
::
platform
::
create_prim_desc_from_format
(
paddle
::
framework
::
vectorize2int
(
dims
()),
fmt
,
data_type
);
layout_
=
DataLayout
::
kMKLDNN
;
}
inline
mkldnn
::
memory
::
primitive_desc
get_mkldnn_prim_desc
()
const
{
return
mem_pd_
;
}
inline
void
set_mkldnn_prim_desc
(
const
mkldnn
::
memory
::
primitive_desc
&
mem_pd
)
{
// Internally MKL-DNN is just copying (increasing reference counter)
// to shared_ptr. So asignment should be quite cheap
mem_pd_
=
mem_pd
;
layout_
=
DataLayout
::
kMKLDNN
;
}
protected:
...
...
@@ -48,12 +76,9 @@ class Tensor {
* @brief the detail format of memory block which have layout as kMKLDNN
*
* @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
* nChw16c, etc. For a MKLDNN memory block, layout will be set as
* DataLayout::kMKLDNN meanwhile detail memory format will be kept in
* this field.
* nChw16c, etc. For a MKLDNN memory block, we store memory descriptor
*/
mkldnn
::
memory
::
format
format_
=
mkldnn
::
memory
::
format
::
format_undef
;
mutable
mkldnn
::
memory
::
primitive_desc
mem_pd_
;
#endif
public:
...
...
paddle/fluid/framework/var_type_traits.h
浏览文件 @
c4187dbd
...
...
@@ -50,8 +50,6 @@ class Scope;
}
// namespace framework
namespace
operators
{
template
<
typename
T
>
class
AlgorithmsCache
;
class
CudnnRNNCache
;
...
...
@@ -144,9 +142,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#ifndef _WIN32
ncclUniqueId
,
platform
::
Communicator
,
#endif
operators
::
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>
,
operators
::
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>
,
operators
::
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>
,
operators
::
CudnnRNNCache
,
#endif
int
,
float
>
;
...
...
paddle/fluid/imperative/layer.cc
浏览文件 @
c4187dbd
...
...
@@ -249,7 +249,8 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
framework
::
Scope
scope
;
PreparedOp
p
=
PreparedOp
::
Prepare
(
ctx
,
*
op_kernel
,
place_
);
p
.
op
.
RuntimeInferShape
(
scope
,
place_
,
ctx
);
p
.
func
(
framework
::
ExecutionContext
(
p
.
op
,
scope
,
*
p
.
dev_ctx
,
p
.
ctx
));
p
.
func
(
framework
::
ExecutionContext
(
p
.
op
,
scope
,
*
p
.
dev_ctx
,
p
.
ctx
,
nullptr
));
}
}
...
...
paddle/fluid/imperative/layer.h
浏览文件 @
c4187dbd
...
...
@@ -44,8 +44,13 @@ class PreparedOp {
PreparedOp
(
const
framework
::
OperatorBase
&
op
,
const
framework
::
RuntimeContext
&
ctx
,
framework
::
OperatorWithKernel
::
OpKernelFunc
func
,
platform
::
DeviceContext
*
dev_ctx
)
:
op
(
op
),
ctx
(
ctx
),
func
(
func
),
dev_ctx
(
dev_ctx
)
{}
platform
::
DeviceContext
*
dev_ctx
,
std
::
vector
<
framework
::
KernelConfig
>*
kernel_configs
)
:
op
(
op
),
ctx
(
ctx
),
func
(
func
),
dev_ctx
(
dev_ctx
),
kernel_configs
(
kernel_configs
)
{}
static
PreparedOp
Prepare
(
const
framework
::
RuntimeContext
&
ctx
,
const
framework
::
OperatorWithKernel
&
op
,
...
...
@@ -64,8 +69,9 @@ class PreparedOp {
framework
::
OperatorWithKernel
::
OpKernelMap
&
kernels
=
kernels_iter
->
second
;
auto
expected_kernel_key
=
op
.
GetExpectedKernelType
(
framework
::
ExecutionContext
(
op
,
framework
::
Scope
(),
*
dev_ctx
,
ctx
));
auto
expected_kernel_key
=
op
.
GetExpectedKernelType
(
framework
::
ExecutionContext
(
op
,
framework
::
Scope
(),
*
dev_ctx
,
ctx
,
nullptr
));
VLOG
(
3
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
auto
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
...
...
@@ -83,7 +89,9 @@ class PreparedOp {
PADDLE_THROW
(
"op %s does not have kernel for %s"
,
op
.
Type
(),
KernelTypeToString
(
expected_kernel_key
));
}
return
PreparedOp
(
op
,
ctx
,
kernel_iter
->
second
,
dev_ctx
);
std
::
vector
<
framework
::
KernelConfig
>*
kernel_configs
=
op
.
GetKernelConfig
(
expected_kernel_key
);
return
PreparedOp
(
op
,
ctx
,
kernel_iter
->
second
,
dev_ctx
,
kernel_configs
);
}
inline
platform
::
DeviceContext
*
GetDeviceContext
()
const
{
return
dev_ctx
;
}
...
...
@@ -92,6 +100,7 @@ class PreparedOp {
const
framework
::
RuntimeContext
&
ctx
;
framework
::
OperatorWithKernel
::
OpKernelFunc
func
;
platform
::
DeviceContext
*
dev_ctx
;
std
::
vector
<
framework
::
KernelConfig
>*
kernel_configs
;
};
class
OpBase
;
...
...
@@ -105,23 +114,23 @@ class VarBase {
public:
VarBase
()
:
VarBase
(
new
framework
::
Variable
(),
new
VarBase
(
true
))
{}
// Owns `var` and `grad`
explicit
VarBase
(
bool
stop_gradient
)
:
VarBase
(
new
framework
::
Variable
(),
stop_gradient
?
nullptr
:
new
VarBase
(
true
),
stop_gradient
)
{}
VarBase
(
framework
::
Variable
*
var
,
VarBase
*
grad
)
:
VarBase
(
var
,
grad
,
false
)
{}
private:
VarBase
(
framework
::
Variable
*
var
,
VarBase
*
grad
,
bool
stop_gradient
)
:
var_desc_
(
nullptr
),
var_
(
var
),
grads_
(
grad
),
stop_gradient_
(
false
),
pre_op_
(
nullptr
),
pre_op_out_idx_
(
-
1
)
{}
explicit
VarBase
(
bool
stop_gradient
)
:
var_desc_
(
nullptr
),
var_
(
new
framework
::
Variable
()),
grads_
(
stop_gradient
?
nullptr
:
new
VarBase
(
true
)),
stop_gradient_
(
stop_gradient
),
pre_op_
(
nullptr
),
pre_op_out_idx_
(
-
1
)
{}
public:
virtual
~
VarBase
()
{
if
(
var_
)
{
delete
var_
;
...
...
@@ -132,11 +141,13 @@ class VarBase {
}
}
OpBase
*
PreOp
()
const
{
return
pre_op_
;
}
int
PreOpOutIdx
()
const
{
return
pre_op_out_idx_
;
}
inline
OpBase
*
PreOp
()
const
{
return
pre_op_
;
}
in
line
in
t
PreOpOutIdx
()
const
{
return
pre_op_out_idx_
;
}
void
SetStopGradient
(
bool
stop_gradient
)
{
stop_gradient_
=
stop_gradient
;
}
bool
IsStopGradient
()
const
{
return
stop_gradient_
;
}
inline
void
SetStopGradient
(
bool
stop_gradient
)
{
stop_gradient_
=
stop_gradient
;
}
inline
bool
IsStopGradient
()
const
{
return
stop_gradient_
;
}
void
RunBackward
();
...
...
paddle/fluid/imperative/tracer.cc
浏览文件 @
c4187dbd
...
...
@@ -14,6 +14,8 @@
#include "paddle/fluid/imperative/tracer.h"
#include <set>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
...
...
@@ -66,8 +68,9 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
return
result
;
}
void
Tracer
::
Trace
(
OpBase
*
op
,
const
VarBasePtrMap
&
inputs
,
const
VarBasePtrMap
&
outputs
,
framework
::
BlockDesc
*
block
,
std
::
set
<
std
::
string
>
Tracer
::
Trace
(
OpBase
*
op
,
const
VarBasePtrMap
&
inputs
,
const
VarBasePtrMap
&
outputs
,
framework
::
BlockDesc
*
block
,
const
platform
::
Place
expected_place
,
const
bool
stop_gradient
)
{
std
::
map
<
std
::
string
,
VarBase
*>
vars
;
...
...
@@ -76,6 +79,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
VLOG
(
3
)
<<
"tracer tracing "
<<
op_desc
->
Type
();
op_desc
->
InferShape
(
*
block
);
op_desc
->
InferVarType
(
block
);
std
::
unique_ptr
<
framework
::
OperatorBase
>
op_base
=
framework
::
OpRegistry
::
CreateOp
(
*
op_desc
);
...
...
@@ -92,7 +96,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
invars
.
emplace_back
(
inp
->
var_
);
vars
[
inp
->
var_desc_
->
Name
()]
=
inp
;
if
(
inp
->
PreOp
())
{
if
(
inp
->
PreOp
()
&&
!
inp
->
IsStopGradient
()
)
{
op
->
pre_ops_
[
it
.
first
].
push_back
(
inp
->
PreOp
());
op
->
pre_ops_out_idx_
[
it
.
first
].
push_back
(
inp
->
PreOpOutIdx
());
}
else
{
...
...
@@ -138,8 +142,11 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
op
->
place_
=
GetExpectedPlace
(
expected_place
,
inputs
);
PreparedOp
prepared_op
=
PreparedOp
::
Prepare
(
ctx
,
*
op_kernel
,
op
->
place_
);
prepared_op
.
op
.
RuntimeInferShape
(
scope
,
op
->
place_
,
ctx
);
prepared_op
.
func
(
framework
::
ExecutionContext
(
prepared_op
.
op
,
scope
,
*
prepared_op
.
dev_ctx
,
prepared_op
.
ctx
));
prepared_op
.
func
(
framework
::
ExecutionContext
(
prepared_op
.
op
,
scope
,
*
prepared_op
.
dev_ctx
,
prepared_op
.
ctx
,
prepared_op
.
kernel_configs
));
std
::
set
<
std
::
string
>
vars_saved_for_backward
;
if
(
!
stop_gradient
)
{
std
::
unique_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
string
>>
grad_to_var
(
...
...
@@ -160,6 +167,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
PADDLE_ENFORCE
(
fwd_var_it
!=
vars
.
end
());
// Forward inputs or outputs.
grad_in_vars
.
push_back
(
fwd_var_it
->
second
->
var_
);
vars_saved_for_backward
.
insert
(
it
.
first
);
}
else
{
VarBase
*
var
=
vars
[
var_it
->
second
];
if
(
!
var
->
grads_
->
var_
->
IsInitialized
())
{
...
...
@@ -193,6 +201,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
}
op
->
block_
=
block
;
return
vars_saved_for_backward
;
}
std
::
vector
<
VarBase
*>
Tracer
::
PyTrace
(
OpBase
*
op
,
...
...
@@ -202,7 +211,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
op
->
input_vars_
[
PyLayer
::
kFwdInp
]
=
inputs
;
op
->
output_vars_
[
PyLayer
::
kFwdOut
]
=
PyLayer
::
Apply
(
op
->
forward_id_
,
inputs
);
for
(
VarBase
*
inp
:
inputs
)
{
if
(
inp
->
PreOp
())
{
if
(
inp
->
PreOp
()
&&
!
inp
->
IsStopGradient
()
)
{
op
->
pre_ops_
[
PyLayer
::
kFwdInp
].
push_back
(
inp
->
PreOp
());
op
->
pre_ops_out_idx_
[
PyLayer
::
kFwdInp
].
push_back
(
inp
->
PreOpOutIdx
());
}
else
{
...
...
paddle/fluid/imperative/tracer.h
浏览文件 @
c4187dbd
...
...
@@ -15,6 +15,7 @@
#pragma once
#include <map>
#include <set>
#include <string>
#include <vector>
...
...
@@ -43,8 +44,9 @@ class Tracer {
virtual
~
Tracer
()
{}
void
Trace
(
OpBase
*
op
,
const
VarBasePtrMap
&
inputs
,
const
VarBasePtrMap
&
outputs
,
framework
::
BlockDesc
*
block
,
std
::
set
<
std
::
string
>
Trace
(
OpBase
*
op
,
const
VarBasePtrMap
&
inputs
,
const
VarBasePtrMap
&
outputs
,
framework
::
BlockDesc
*
block
,
const
platform
::
Place
expected_place
,
const
bool
stop_gradient
=
false
);
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
c4187dbd
...
...
@@ -66,7 +66,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
dynload_warpctc
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler
sample_prob
tree2col
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search
)
if
(
WITH_GPU
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
depthwise_conv prelu
)
...
...
@@ -97,3 +97,4 @@ if (WITH_PYTHON)
endif
()
set
(
GLOB_OP_LIB
${
OP_LIBRARY
}
CACHE INTERNAL
"Global OP library"
)
add_subdirectory
(
benchmark
)
paddle/fluid/operators/alloc_continuous_space_op.cc
0 → 100644
浏览文件 @
c4187dbd
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
static
framework
::
proto
::
VarType
::
Type
kDefaultDtype
=
framework
::
proto
::
VarType
::
Type
::
VarType_Type_BOOL
;
template
<
typename
DeviceContext
,
typename
T
>
class
AllocContinuousSpaceKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
&
in_var_names
=
context
.
Inputs
(
"Input"
);
auto
&
out_var_names
=
context
.
Outputs
(
"Output"
);
auto
&
in_vars
=
context
.
MultiInputVar
(
"Input"
);
auto
out_vars
=
context
.
MultiOutputVar
(
"Output"
);
PADDLE_ENFORCE_GT
(
in_var_names
.
size
(),
static_cast
<
size_t
>
(
0
));
PADDLE_ENFORCE_EQ
(
in_var_names
.
size
(),
out_var_names
.
size
());
for
(
size_t
i
=
0
;
i
<
in_var_names
.
size
();
++
i
)
{
// Only support LoDTensor
PADDLE_ENFORCE_NOT_NULL
(
in_vars
[
i
],
"%s should not be nullptr,"
,
in_var_names
[
i
]);
PADDLE_ENFORCE_NOT_NULL
(
out_vars
[
i
],
"%s should not be nullptr,"
,
out_var_names
[
i
]);
PADDLE_ENFORCE
(
in_vars
[
i
]
->
IsType
<
framework
::
LoDTensor
>
());
PADDLE_ENFORCE
(
out_vars
[
i
]
->
IsType
<
framework
::
LoDTensor
>
());
}
auto
in_tensors
=
context
.
MultiInput
<
framework
::
LoDTensor
>
(
"Input"
);
if
(
context
.
Attr
<
bool
>
(
"check_name"
))
{
for
(
size_t
i
=
0
;
i
<
in_var_names
.
size
();
++
i
)
{
PADDLE_ENFORCE_EQ
(
in_var_names
[
i
],
out_var_names
[
i
]);
}
}
else
{
// Init the output as input
for
(
size_t
i
=
0
;
i
<
in_tensors
.
size
();
++
i
)
{
out_vars
[
i
]
->
GetMutable
<
framework
::
LoDTensor
>
()
->
Resize
(
in_tensors
[
i
]
->
dims
());
}
}
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
// Get numel and dtype
size_t
numel
=
0
;
auto
dtype
=
kDefaultDtype
;
GetMemSizeAndDtype
(
in_tensors
,
in_var_names
,
&
numel
,
&
dtype
);
// Alloc the continuous space
auto
fused_tensor
=
context
.
Output
<
framework
::
LoDTensor
>
(
"FusedOutput"
);
fused_tensor
->
Resize
(
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
numel
)}))
.
mutable_data
(
context
.
GetPlace
(),
dtype
);
// Init the continuous space
auto
out_tensors
=
context
.
MultiOutput
<
framework
::
LoDTensor
>
(
"Output"
);
int64_t
offset
=
0
;
if
(
context
.
Attr
<
bool
>
(
"copy_data"
))
{
for
(
size_t
i
=
0
;
i
<
in_var_names
.
size
();
++
i
)
{
int64_t
len
=
out_tensors
[
i
]
->
numel
();
auto
sub_tensor
=
fused_tensor
->
Slice
(
offset
,
offset
+
len
);
offset
+=
len
;
framework
::
TensorCopy
(
*
out_tensors
[
i
],
context
.
GetPlace
(),
dev_ctx
,
&
sub_tensor
);
}
}
else
if
(
context
.
Attr
<
bool
>
(
"set_constant"
))
{
math
::
SetConstant
<
DeviceContext
,
T
>
set_constant
;
set_constant
(
dev_ctx
,
fused_tensor
,
static_cast
<
T
>
(
context
.
Attr
<
float
>
(
"constant"
)));
}
// Make the outputs point to the continuous space.
offset
=
0
;
for
(
size_t
i
=
0
;
i
<
out_tensors
.
size
();
++
i
)
{
int64_t
len
=
out_tensors
[
i
]
->
numel
();
auto
dim
=
out_tensors
[
i
]
->
dims
();
out_tensors
[
i
]
->
ShareDataWith
(
fused_tensor
->
Slice
(
offset
,
offset
+
len
))
.
Resize
(
dim
);
offset
+=
len
;
VLOG
(
10
)
<<
"alloc_space_for_vars: output("
<<
out_var_names
[
i
]
<<
") ,dim:("
<<
dim
<<
")"
<<
" Address: "
<<
out_tensors
[
i
]
->
data
<
void
>
();
}
}
void
GetMemSizeAndDtype
(
const
std
::
vector
<
const
framework
::
LoDTensor
*>
&
lod_tensors
,
const
std
::
vector
<
std
::
string
>
var_names
,
size_t
*
numel
,
framework
::
proto
::
VarType
::
Type
*
dtype
)
const
{
PADDLE_ENFORCE_EQ
(
lod_tensors
.
size
(),
var_names
.
size
());
*
numel
=
0
;
for
(
size_t
i
=
0
;
i
<
var_names
.
size
();
++
i
)
{
PADDLE_ENFORCE
(
lod_tensors
[
i
]
->
IsInitialized
(),
"%s is not initialized."
,
var_names
[
i
]);
auto
p_dtype
=
lod_tensors
[
i
]
->
type
();
if
(
*
dtype
==
kDefaultDtype
)
{
PADDLE_ENFORCE_NE
(
p_dtype
,
kDefaultDtype
,
"%s's type should not be %s."
,
var_names
[
i
],
kDefaultDtype
);
*
dtype
=
p_dtype
;
}
PADDLE_ENFORCE_EQ
(
p_dtype
,
*
dtype
,
"Input vars is not equal."
);
auto
size
=
lod_tensors
[
i
]
->
numel
();
PADDLE_ENFORCE_GT
(
size
,
0
);
VLOG
(
10
)
<<
"alloc_space_for_vars: input("
<<
var_names
[
i
]
<<
") ,dim:("
<<
lod_tensors
[
i
]
->
dims
()
<<
")"
;
*
numel
+=
size
;
}
}
};
class
AllocContinuousSpaceOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{}
};
class
AllocContinuousSpaceOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"Input"
,
"(vector<LoDTensor>) The input tensors of"
" alloc_continuous_space operator."
)
.
AsDuplicable
();
AddOutput
(
"Output"
,
"(vector<LoDTensor>) The output "
"tensors of alloc_continuous_space operator. And the address "
"of output tensors are continuous, they are sliced from the "
"tensor of FusedOutput."
)
.
AsDuplicable
();
AddOutput
(
"FusedOutput"
,
"(LoDTensor) The output tensor "
"of alloc_continuous_space operator. And the tensors of"
" Output is sliced from the tensor of FusedOutput."
);
AddAttr
<
bool
>
(
"copy_data"
,
"Whether to copy the Input value to Output."
)
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"set_constant"
,
"Whether to set the Output with a constant value."
)
.
SetDefault
(
false
);
AddAttr
<
float
>
(
"constant"
,
"If set_constant is true, the constant value will be used "
"to set the Output."
)
.
SetDefault
(
0.0
);
AddAttr
<
bool
>
(
"check_name"
,
"Whether to check the name of Input and Output to ensure "
"they are the same separately."
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
AllocContinuousSpace Operator.
alloc_continuous_space is used to make the address of Output
continuous according to the Input. This Op will alloc a big tensor
according to the tensors of Input, the dtype is the same with those input tensors,
the size is the sum of those input tensors' numel, and the dim of the big
tensor is {sum(numel)}. And the big tensor is stored in FusedOutput.
The tensors of Output are sliced from the tensor of FusedOutput.
Note that, the dtype of Input should be the same, and the dim of Input
and Output should equal.
The tensors of Input and Output could be the same or different. And
alloc_continuous_space allows copying the value of Input to Output, or
setting the Output with a constant value.
)DOC"
);
}
};
}
// namespace operators
}
// namespace paddle
REGISTER_OPERATOR
(
alloc_continuous_space
,
paddle
::
operators
::
AllocContinuousSpaceOp
,
paddle
::
operators
::
AllocContinuousSpaceOpMaker
);
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CPU_KERNEL
(
alloc_continuous_space
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL
(
alloc_continuous_space
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
#endif
paddle/fluid/operators/beam_search_decode_op.cc
浏览文件 @
c4187dbd
...
...
@@ -123,7 +123,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
framework
::
RuntimeContext
run_ctx
(
Inputs
(),
Outputs
(),
scope
);
framework
::
ExecutionContext
ctx
(
*
this
,
scope
,
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
*
this
,
scope
,
dev_ctx
,
run_ctx
,
nullptr
);
const
LoDTensorArray
*
ids
=
ctx
.
Input
<
LoDTensorArray
>
(
"Ids"
);
const
LoDTensorArray
*
scores
=
ctx
.
Input
<
LoDTensorArray
>
(
"Scores"
);
...
...
paddle/fluid/operators/beam_search_decode_op.h
浏览文件 @
c4187dbd
...
...
@@ -122,7 +122,7 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
auto
cpu_place
=
std
::
unique_ptr
<
paddle
::
platform
::
CPUPlace
>
(
new
paddle
::
platform
::
CPUPlace
());
paddle
::
platform
::
CPUDeviceContext
cpu_ctx
(
*
cpu_place
.
get
()
);
paddle
::
platform
::
CPUDeviceContext
cpu_ctx
(
*
cpu_place
);
framework
::
LoD
lod
;
lod
.
push_back
(
source_level_lod
);
...
...
paddle/fluid/operators/benchmark/CMakeLists.txt
0 → 100644
浏览文件 @
c4187dbd
cc_test
(
op_tester SRCS op_tester.cc op_tester_config.cc
DEPS memory timer framework_proto proto_desc lod_tensor op_registry
device_context scope
${
GLOB_OP_LIB
}
${
GLOB_OPERATOR_DEPS
}
)
paddle/fluid/operators/benchmark/op_tester.cc
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester.h"
#include <fstream>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h"
namespace
paddle
{
namespace
operators
{
namespace
benchmark
{
DEFINE_string
(
op_config_list
,
""
,
"Path of op config file."
);
DEFINE_int32
(
specified_config_id
,
-
1
,
"Test the specified op config."
);
void
OpTester
::
Init
(
const
std
::
string
&
filename
)
{
Init
(
OpTesterConfig
(
filename
));
}
void
OpTester
::
Init
(
const
OpTesterConfig
&
config
)
{
config_
=
config
;
auto
&
op_desc_info
=
framework
::
OpInfoMap
::
Instance
();
// Initialize the OpDesc
if
(
op_desc_info
.
Has
(
config_
.
op_type
))
{
type_
=
config_
.
op_type
;
op_desc_
.
SetType
(
config_
.
op_type
);
CreateInputVarDesc
();
CreateOutputVarDesc
();
}
else
{
LOG
(
FATAL
)
<<
"Op
\"
"
<<
config_
.
op_type
<<
"
\"
is not registered."
;
}
if
(
config_
.
device_id
>=
0
)
{
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device_id
);
}
else
{
place_
=
paddle
::
platform
::
CPUPlace
();
}
framework
::
InitDevices
(
false
);
scope_
.
reset
(
new
paddle
::
framework
::
Scope
());
op_
=
framework
::
OpRegistry
::
CreateOp
(
op_desc_
);
CreateVariables
(
scope_
.
get
());
}
void
OpTester
::
Run
()
{
if
(
config_
.
print_debug_string
)
{
LOG
(
INFO
)
<<
DebugString
();
}
// Warm up
RunImpl
();
platform
::
Timer
timer
;
if
(
config_
.
profile
)
{
if
(
platform
::
is_cpu_place
(
place_
))
{
platform
::
EnableProfiler
(
platform
::
ProfilerState
::
kCPU
);
}
else
{
#ifdef PADDLE_WITH_CUDA
platform
::
EnableProfiler
(
platform
::
ProfilerState
::
kAll
);
platform
::
SetDeviceId
(
config_
.
device_id
);
#else
PADDLE_THROW
(
"'CUDAPlace' is not supported in CPU only device."
);
#endif
}
timer
.
Start
();
for
(
int
i
=
config_
.
repeat
;
i
>
0
;
--
i
)
{
RunImpl
();
}
timer
.
Pause
();
platform
::
DisableProfiler
(
platform
::
EventSortingKey
::
kDefault
,
"op_tester_profiler"
);
}
else
{
timer
.
Start
();
for
(
int
i
=
config_
.
repeat
;
i
>
0
;
--
i
)
{
RunImpl
();
}
timer
.
Pause
();
}
config_
.
runtime
=
timer
.
ElapsedMS
()
/
config_
.
repeat
;
LOG
(
INFO
)
<<
"=== Run "
<<
config_
.
repeat
<<
" times, latency: "
<<
config_
.
runtime
<<
" ms ==="
;
}
void
OpTester
::
RunImpl
()
{
op_
->
Run
(
*
scope_
,
place_
);
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
)
->
Wait
();
scope_
->
DropKids
();
}
std
::
vector
<
std
::
string
>
OpTester
::
GetOpProtoInputNames
()
{
std
::
vector
<
std
::
string
>
input_names
;
const
framework
::
proto
::
OpProto
&
proto
=
framework
::
OpInfoMap
::
Instance
().
Get
(
type_
).
Proto
();
for
(
int
i
=
0
;
i
!=
proto
.
inputs_size
();
++
i
)
{
const
auto
&
input
=
proto
.
inputs
(
i
);
input_names
.
push_back
(
input
.
name
());
}
return
input_names
;
}
std
::
vector
<
std
::
string
>
OpTester
::
GetOpProtoOutputNames
()
{
std
::
vector
<
std
::
string
>
output_names
;
const
framework
::
proto
::
OpProto
&
proto
=
framework
::
OpInfoMap
::
Instance
().
Get
(
type_
).
Proto
();
for
(
int
i
=
0
;
i
!=
proto
.
outputs_size
();
++
i
)
{
const
auto
&
output
=
proto
.
outputs
(
i
);
output_names
.
push_back
(
output
.
name
());
}
return
output_names
;
}
void
OpTester
::
CreateInputVarDesc
()
{
std
::
vector
<
std
::
string
>
input_names
=
GetOpProtoInputNames
();
for
(
auto
&
name
:
input_names
)
{
const
OpInputConfig
*
input
=
config_
.
GetInput
(
name
);
if
(
input
==
nullptr
)
{
LOG
(
FATAL
)
<<
"The input "
<<
name
<<
" of op "
<<
config_
.
op_type
<<
" is not correctlly provided."
;
}
std
::
string
var_name
=
config_
.
op_type
+
"."
+
name
;
framework
::
VarDesc
*
var
=
Var
(
var_name
);
// Need to support more type
var
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
var
->
SetPersistable
(
false
);
var
->
SetDataType
(
framework
::
proto
::
VarType
::
FP32
);
var
->
SetShape
(
input
->
dims
);
op_desc_
.
SetInput
(
name
,
{
var_name
});
input_lods_
[
var_name
]
=
input
->
lod
;
}
}
void
OpTester
::
CreateOutputVarDesc
()
{
std
::
vector
<
std
::
string
>
output_names
=
GetOpProtoOutputNames
();
for
(
auto
&
name
:
output_names
)
{
std
::
string
var_name
=
config_
.
op_type
+
"."
+
name
;
framework
::
VarDesc
*
var
=
Var
(
var_name
);
// Need to support more type
var
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
var
->
SetPersistable
(
false
);
var
->
SetDataType
(
framework
::
proto
::
VarType
::
FP32
);
op_desc_
.
SetOutput
(
name
,
{
var_name
});
}
}
framework
::
VarDesc
*
OpTester
::
Var
(
const
std
::
string
&
name
)
{
auto
it
=
vars_
.
find
(
name
);
if
(
it
!=
vars_
.
end
())
{
return
it
->
second
.
get
();
}
auto
*
var
=
new
framework
::
VarDesc
(
name
);
vars_
[
name
].
reset
(
var
);
return
var
;
}
template
<
typename
T
>
void
OpTester
::
SetupTensor
(
framework
::
LoDTensor
*
tensor
,
const
std
::
vector
<
int64_t
>
&
shape
,
T
lower
,
T
upper
)
{
static
unsigned
int
seed
=
100
;
std
::
mt19937
rng
(
seed
++
);
std
::
uniform_real_distribution
<
double
>
uniform_dist
(
0
,
1
);
T
*
ptr
=
tensor
->
mutable_data
<
T
>
(
framework
::
make_ddim
(
shape
),
place_
);
if
(
platform
::
is_cpu_place
(
place_
))
{
for
(
int
i
=
0
;
i
<
tensor
->
numel
();
++
i
)
{
ptr
[
i
]
=
static_cast
<
T
>
(
uniform_dist
(
rng
)
*
(
upper
-
lower
)
+
lower
);
}
}
else
{
framework
::
LoDTensor
cpu_tensor
;
T
*
cpu_ptr
=
cpu_tensor
.
mutable_data
<
T
>
(
framework
::
make_ddim
(
shape
),
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
cpu_tensor
.
numel
();
++
i
)
{
cpu_ptr
[
i
]
=
static_cast
<
T
>
(
uniform_dist
(
rng
)
*
(
upper
-
lower
)
+
lower
);
}
TensorCopySync
(
cpu_tensor
,
place_
,
tensor
);
}
}
void
OpTester
::
CreateVariables
(
framework
::
Scope
*
scope
)
{
for
(
auto
&
item
:
vars_
)
{
auto
&
var
=
item
.
second
;
if
(
var
->
Name
()
==
framework
::
kEmptyVarName
)
{
continue
;
}
auto
*
ptr
=
scope
->
Var
(
var
->
Name
());
framework
::
InitializeVariable
(
ptr
,
var
->
GetType
());
if
(
var
->
Persistable
())
{
VLOG
(
3
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" global, which pointer is "
<<
ptr
;
}
else
{
VLOG
(
3
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" locally, which pointer is "
<<
ptr
;
}
}
for
(
auto
&
item
:
input_lods_
)
{
// Allocate memory for input tensor
auto
&
var_name
=
item
.
first
;
VLOG
(
3
)
<<
"Allocate memory for tensor "
<<
var_name
;
auto
&
var_desc
=
vars_
[
var_name
];
std
::
vector
<
int64_t
>
shape
=
var_desc
->
GetShape
();
auto
*
var
=
scope
->
Var
(
var_name
);
auto
*
tensor
=
var
->
GetMutable
<
framework
::
LoDTensor
>
();
SetupTensor
<
float
>
(
tensor
,
shape
,
static_cast
<
float
>
(
0.0
),
static_cast
<
float
>
(
1.0
));
VLOG
(
3
)
<<
"Set lod for tensor "
<<
var_name
;
std
::
vector
<
std
::
vector
<
size_t
>>
&
lod_vec
=
item
.
second
;
framework
::
LoD
lod
;
for
(
size_t
i
=
0
;
i
<
lod_vec
.
size
();
++
i
)
{
lod
.
push_back
(
lod_vec
[
i
]);
}
tensor
->
set_lod
(
lod
);
}
}
static
std
::
string
GenSpaces
(
int
count
)
{
std
::
stringstream
ss
;
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
ss
<<
" "
;
}
return
ss
.
str
();
}
std
::
string
OpTester
::
DebugString
()
{
std
::
stringstream
ss
;
int
count
=
0
;
for
(
auto
&
item
:
vars_
)
{
auto
&
var
=
item
.
second
;
ss
<<
GenSpaces
(
count
++
)
<<
"vars {
\n
"
;
ss
<<
GenSpaces
(
count
)
<<
"name:
\"
"
<<
var
->
Name
()
<<
"
\"\n
"
;
ss
<<
GenSpaces
(
count
++
)
<<
"type: {
\n
"
;
ss
<<
GenSpaces
(
count
)
<<
"type: LOD_TENSOR
\n
"
;
ss
<<
GenSpaces
(
count
++
)
<<
"lod_tensor {
\n
"
;
ss
<<
GenSpaces
(
count
++
)
<<
"tensor {
\n
"
;
ss
<<
GenSpaces
(
count
)
<<
"data_type: FP32
\n
"
;
std
::
vector
<
int64_t
>
shape
=
var
->
GetShape
();
for
(
auto
d
:
shape
)
{
ss
<<
GenSpaces
(
count
)
<<
"dims: "
<<
d
<<
"
\n
"
;
}
ss
<<
GenSpaces
(
--
count
)
<<
"}
\n
"
;
ss
<<
GenSpaces
(
--
count
)
<<
"}
\n
"
;
ss
<<
GenSpaces
(
--
count
)
<<
"}
\n
"
;
ss
<<
GenSpaces
(
count
)
<<
"persistable: "
<<
var
->
Persistable
()
<<
"
\n
"
;
ss
<<
GenSpaces
(
--
count
)
<<
"}
\n
"
;
}
ss
<<
GenSpaces
(
count
++
)
<<
"ops {
\n
"
;
for
(
auto
&
name
:
op_desc_
.
InputNames
())
{
ss
<<
GenSpaces
(
count
++
)
<<
"inputs {
\n
"
;
ss
<<
GenSpaces
(
count
)
<<
"parameters:
\"
"
<<
name
<<
"
\"\n
"
;
ss
<<
GenSpaces
(
count
)
<<
"arguments:
\"
"
<<
op_desc_
.
Input
(
name
)[
0
]
<<
"
\"\n
"
;
ss
<<
GenSpaces
(
--
count
)
<<
"}
\n
"
;
}
for
(
auto
&
name
:
op_desc_
.
OutputNames
())
{
ss
<<
GenSpaces
(
count
++
)
<<
"outputs {
\n
"
;
ss
<<
GenSpaces
(
count
)
<<
"parameters:
\"
"
<<
name
<<
"
\"\n
"
;
ss
<<
GenSpaces
(
count
)
<<
"arguments:
\"
"
<<
op_desc_
.
Output
(
name
)[
0
]
<<
"
\"\n
"
;
ss
<<
GenSpaces
(
--
count
)
<<
"}
\n
"
;
}
ss
<<
GenSpaces
(
count
)
<<
"type: "
<<
op_desc_
.
Type
()
<<
"
\n
"
;
ss
<<
GenSpaces
(
--
count
)
<<
"}
\n
"
;
return
ss
.
str
();
}
TEST
(
op_tester
,
base
)
{
if
(
!
FLAGS_op_config_list
.
empty
())
{
std
::
ifstream
fin
(
FLAGS_op_config_list
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fin
),
"Cannot open file %s"
,
FLAGS_op_config_list
.
c_str
());
std
::
vector
<
OpTesterConfig
>
op_configs
;
while
(
!
fin
.
eof
())
{
OpTesterConfig
config
;
bool
result
=
config
.
Init
(
fin
);
if
(
result
)
{
op_configs
.
push_back
(
config
);
}
}
if
(
FLAGS_specified_config_id
>=
0
&&
FLAGS_specified_config_id
<
static_cast
<
int
>
(
op_configs
.
size
()))
{
OpTester
tester
;
tester
.
Init
(
op_configs
[
FLAGS_specified_config_id
]);
tester
.
Run
();
}
else
{
for
(
size_t
i
=
0
;
i
<
op_configs
.
size
();
++
i
)
{
OpTester
tester
;
tester
.
Init
(
op_configs
[
i
]);
tester
.
Run
();
}
}
}
else
{
OpTester
tester
;
OpTesterConfig
config
;
config
.
op_type
=
"elementwise_add"
;
config
.
inputs
.
resize
(
2
);
config
.
inputs
[
0
].
name
=
"X"
;
config
.
inputs
[
0
].
dims
=
{
64
,
64
};
config
.
inputs
[
1
].
name
=
"Y"
;
config
.
inputs
[
1
].
dims
=
{
64
,
1
};
tester
.
Init
(
config
);
tester
.
Run
();
}
}
}
// namespace benchmark
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/benchmark/op_tester.h
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
namespace
paddle
{
namespace
operators
{
namespace
benchmark
{
class
OpTester
{
public:
OpTester
()
{}
void
Init
(
const
std
::
string
&
filename
);
void
Init
(
const
OpTesterConfig
&
config
);
void
Run
();
std
::
string
DebugString
();
private:
std
::
vector
<
std
::
string
>
GetOpProtoInputNames
();
std
::
vector
<
std
::
string
>
GetOpProtoOutputNames
();
void
CreateInputVarDesc
();
void
CreateOutputVarDesc
();
framework
::
VarDesc
*
Var
(
const
std
::
string
&
name
);
void
CreateVariables
(
framework
::
Scope
*
scope
);
template
<
typename
T
>
void
SetupTensor
(
framework
::
LoDTensor
*
input
,
const
std
::
vector
<
int64_t
>
&
shape
,
T
lower
,
T
upper
);
void
RunImpl
();
private:
OpTesterConfig
config_
;
std
::
string
type_
;
framework
::
OpDesc
op_desc_
;
std
::
unordered_map
<
std
::
string
,
std
::
unique_ptr
<
framework
::
VarDesc
>>
vars_
;
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
std
::
vector
<
size_t
>>>
input_lods_
;
std
::
unique_ptr
<
framework
::
OperatorBase
>
op_
;
platform
::
Place
place_
;
std
::
unique_ptr
<
framework
::
Scope
>
scope_
;
};
}
// namespace benchmark
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/benchmark/op_tester_config.cc
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
#include <fstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
operators
{
namespace
benchmark
{
static
const
char
kStartSeparator
[]
=
"{"
;
static
const
char
kEndSeparator
[]
=
"}"
;
static
const
char
kSepBetweenItems
[]
=
";"
;
static
bool
StartWith
(
const
std
::
string
&
str
,
const
std
::
string
&
substr
)
{
return
str
.
find
(
substr
)
==
0
;
}
static
bool
EndWith
(
const
std
::
string
&
str
,
const
std
::
string
&
substr
)
{
return
str
.
rfind
(
substr
)
==
(
str
.
length
()
-
substr
.
length
());
}
static
void
EraseEndSep
(
std
::
string
*
str
,
std
::
string
substr
=
kSepBetweenItems
)
{
if
(
EndWith
(
*
str
,
substr
))
{
str
->
erase
(
str
->
length
()
-
substr
.
length
(),
str
->
length
());
}
}
void
OpInputConfig
::
ParseDims
(
std
::
istream
&
is
)
{
std
::
string
dims_str
;
is
>>
dims_str
;
dims
.
clear
();
std
::
string
token
;
std
::
istringstream
token_stream
(
dims_str
);
while
(
std
::
getline
(
token_stream
,
token
,
'x'
))
{
dims
.
push_back
(
std
::
stoi
(
token
));
}
}
void
OpInputConfig
::
ParseLoD
(
std
::
istream
&
is
)
{
std
::
string
lod_str
;
std
::
string
start_sep
=
std
::
string
(
kStartSeparator
)
+
std
::
string
(
kStartSeparator
);
std
::
string
end_sep
=
std
::
string
(
kEndSeparator
)
+
std
::
string
(
kEndSeparator
);
std
::
string
sep
;
is
>>
sep
;
if
(
StartWith
(
sep
,
start_sep
))
{
lod_str
+=
sep
;
while
(
!
EndWith
(
sep
,
end_sep
))
{
is
>>
sep
;
lod_str
+=
sep
;
}
}
EraseEndSep
(
&
lod_str
);
PADDLE_ENFORCE_GE
(
lod_str
.
length
(),
4U
);
VLOG
(
4
)
<<
"lod: "
<<
lod_str
<<
", length: "
<<
lod_str
.
length
();
// Parse the lod_str
lod
.
clear
();
for
(
size_t
i
=
1
;
i
<
lod_str
.
length
()
-
1
;)
{
if
(
lod_str
[
i
]
==
'{'
)
{
std
::
vector
<
size_t
>
level
;
while
(
lod_str
[
i
]
!=
'}'
)
{
++
i
;
std
::
string
number
;
while
(
lod_str
[
i
]
>=
'0'
&&
lod_str
[
i
]
<=
'9'
)
{
number
+=
lod_str
[
i
];
++
i
;
}
level
.
push_back
(
atoi
(
number
.
c_str
()));
}
lod
.
push_back
(
level
);
}
else
if
(
lod_str
[
i
]
==
'}'
)
{
++
i
;
}
}
}
OpInputConfig
::
OpInputConfig
(
std
::
istream
&
is
)
{
std
::
string
sep
;
is
>>
sep
;
if
(
sep
==
kStartSeparator
)
{
while
(
sep
!=
kEndSeparator
)
{
is
>>
sep
;
if
(
sep
==
"name"
||
sep
==
"name:"
)
{
is
>>
name
;
EraseEndSep
(
&
name
);
}
else
if
(
sep
==
"dims"
||
sep
==
"dims:"
)
{
ParseDims
(
is
);
}
else
if
(
sep
==
"lod"
||
sep
==
"lod:"
)
{
ParseLoD
(
is
);
}
}
}
}
OpTesterConfig
::
OpTesterConfig
(
const
std
::
string
&
filename
)
{
std
::
ifstream
fin
(
filename
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fin
),
"Cannot open file %s"
,
filename
.
c_str
());
Init
(
fin
);
}
bool
OpTesterConfig
::
Init
(
std
::
istream
&
is
)
{
std
::
string
sep
;
is
>>
sep
;
if
(
sep
==
kStartSeparator
)
{
while
(
sep
!=
kEndSeparator
)
{
is
>>
sep
;
if
(
sep
==
"op_type"
||
sep
==
"op_type:"
)
{
is
>>
op_type
;
}
else
if
(
sep
==
"device_id"
||
sep
==
"device_id:"
)
{
is
>>
device_id
;
}
else
if
(
sep
==
"repeat"
||
sep
==
"repeat:"
)
{
is
>>
repeat
;
}
else
if
(
sep
==
"profile"
||
sep
==
"profile:"
)
{
is
>>
profile
;
}
else
if
(
sep
==
"print_debug_string"
||
sep
==
"print_debug_string:"
)
{
is
>>
print_debug_string
;
}
else
if
(
sep
==
"input"
||
sep
==
"input:"
)
{
OpInputConfig
input_config
(
is
);
inputs
.
push_back
(
input_config
);
}
else
if
(
sep
==
"attrs"
||
sep
==
"attrs:"
)
{
ParseAttrs
(
is
);
}
else
{
if
(
sep
!=
kEndSeparator
)
{
return
false
;
}
}
}
}
else
{
return
false
;
}
return
true
;
}
bool
OpTesterConfig
::
ParseAttrs
(
std
::
istream
&
is
)
{
std
::
string
sep
;
is
>>
sep
;
if
(
sep
==
kStartSeparator
)
{
while
(
true
)
{
std
::
string
key
;
is
>>
key
;
if
(
key
==
kEndSeparator
)
{
break
;
}
std
::
string
value
;
is
>>
value
;
EraseEndSep
(
&
key
,
":"
);
EraseEndSep
(
&
value
);
attrs
[
key
]
=
value
;
}
}
return
true
;
}
const
OpInputConfig
*
OpTesterConfig
::
GetInput
(
const
std
::
string
&
name
)
{
for
(
size_t
i
=
0
;
i
<
inputs
.
size
();
++
i
)
{
if
(
inputs
[
i
].
name
==
name
)
{
return
&
inputs
[
i
];
}
}
return
nullptr
;
}
}
// namespace benchmark
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/benchmark/op_tester_config.h
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <istream>
#include <string>
#include <unordered_map>
#include <vector>
namespace
paddle
{
namespace
operators
{
namespace
benchmark
{
struct
OpInputConfig
{
OpInputConfig
()
{}
explicit
OpInputConfig
(
std
::
istream
&
is
);
void
ParseDims
(
std
::
istream
&
is
);
void
ParseLoD
(
std
::
istream
&
is
);
std
::
string
name
;
std
::
vector
<
int64_t
>
dims
;
std
::
vector
<
std
::
vector
<
size_t
>>
lod
;
};
struct
OpTesterConfig
{
OpTesterConfig
()
{}
explicit
OpTesterConfig
(
const
std
::
string
&
filename
);
bool
Init
(
std
::
istream
&
is
);
bool
ParseAttrs
(
std
::
istream
&
is
);
const
OpInputConfig
*
GetInput
(
const
std
::
string
&
name
);
std
::
string
op_type
;
std
::
vector
<
OpInputConfig
>
inputs
;
std
::
unordered_map
<
std
::
string
,
std
::
string
>
attrs
;
int
device_id
{
-
1
};
// CPU: -1
int
repeat
{
1
};
int
profile
{
0
};
int
print_debug_string
{
0
};
double
runtime
{
0.0
};
};
}
// namespace benchmark
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
c4187dbd
...
...
@@ -42,6 +42,7 @@ using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using
DataLayout
=
platform
::
DataLayout
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
using
framework
::
AlgorithmsCache
;
template
<
typename
T
>
class
CUDNNConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
...
...
@@ -169,18 +170,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
workspace_size_limit
,
&
algo
));
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
}
else
if
(
exhaustive_search
&&
(
!
half_float
))
{
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>*
algo_cache
=
nullptr
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNFwdAlgoCache
))
{
algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
else
{
algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>&
algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
(
0
);
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
...
...
@@ -188,7 +179,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
algo
=
algo_cache
->
GetAlgorithm
(
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
...
...
@@ -382,22 +373,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if
(
input_grad
)
{
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>*
data_algo_cache
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNBwdDataAlgoCache
))
{
data_algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNBwdDataAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
();
}
else
{
data_algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNBwdDataAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
();
}
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>&
data_algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
(
0
);
data_algo
=
data_algo_cache
->
GetAlgorithm
(
data_algo
=
data_algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdDataAlgoPerf_t
,
...
...
@@ -448,22 +428,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if
(
filter_grad
)
{
T
*
filter_grad_data
=
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>*
f_algo_cache
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNBwdFilterAlgoCache
))
{
f_algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNBwdFilterAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
();
}
else
{
f_algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNBwdFilterAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
();
}
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>&
f_algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
(
1
);
filter_algo
=
f_algo_cache
->
GetAlgorithm
(
filter_algo
=
f_algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdFilterAlgoPerf_t
,
...
...
paddle/fluid/operators/conv_cudnn_op_cache.h
浏览文件 @
c4187dbd
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <functional>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64
(
conv_workspace_size_limit
);
...
...
@@ -46,100 +47,5 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static
constexpr
size_t
kNUM_CUDNN_BWD_DATA_ALGS
=
5
;
#endif
template
<
typename
TAlgorithm
>
class
AlgorithmsCache
{
public:
AlgorithmsCache
()
:
search_times_
(
0
)
{
hash_
.
clear
();
}
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
// can set for different data type
std
::
function
<
TAlgorithm
()
>
gen_func
);
TAlgorithm
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
);
private:
std
::
unordered_map
<
int64_t
,
TAlgorithm
>
hash_
;
std
::
mutex
mutex_
;
int
search_times_
;
};
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
int64_t
seed
=
0
;
// Hash all of the inputs, use to try and look up a previously
// discovered algorithm, or fall back to generating a new one.
std
::
hash
<
int64_t
>
hashFn
;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for
(
const
auto
num
:
dims1
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
for
(
const
auto
num
:
dims2
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
1
;
}
for
(
const
auto
num
:
strides
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
2
;
}
for
(
const
auto
num
:
paddings
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
3
;
}
for
(
const
auto
num
:
dilations
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
4
;
}
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
algorithmFlags
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
5
;
if
(
seed
==
0
)
return
gen_func
();
if
(
hash_
.
find
(
seed
)
==
hash_
.
end
())
{
TAlgorithm
value
=
gen_func
();
hash_
[
seed
]
=
value
;
}
return
hash_
[
seed
];
}
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
if
(
hash_
.
find
(
area
)
!=
hash_
.
end
())
{
return
hash_
[
area
];
}
if
(
search_times_
<
search_times
)
{
auto
algo
=
gen_func
();
hash_
[
area
]
=
algo
;
++
search_times_
;
return
algo
;
}
TAlgorithm
algo
;
int64_t
min
=
static_cast
<
uint64_t
>
(
INT_MAX
);
for
(
const
auto
&
m
:
hash_
)
{
if
(
m
.
first
<
min
)
{
min
=
m
.
first
;
algo
=
m
.
second
;
}
}
return
algo
;
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/conv_fusion_op.cu.cc
浏览文件 @
c4187dbd
...
...
@@ -30,6 +30,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using
ScopedConvolutionDescriptor
=
platform
::
ScopedConvolutionDescriptor
;
using
ScopedActivationDescriptor
=
platform
::
ScopedActivationDescriptor
;
using
DataLayout
=
platform
::
DataLayout
;
using
framework
::
AlgorithmsCache
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
...
...
@@ -139,37 +141,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
}
return
fwd_perf_stat
[
0
].
algo
;
};
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>*
algo_cache
=
nullptr
;
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>&
algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
(
0
);
int
search_times
=
ctx
.
Attr
<
int
>
(
"search_times"
);
search_times
=
std
::
max
(
static_cast
<
int
>
(
FLAGS_cudnn_exhaustive_search_times
),
search_times
);
// TODO(dangqingqing): Unify this if-else.
if
(
search_times
>
0
)
{
// The searched algo will be cached by `search_times` times for
// different input dimension. For other dimensions, select the algo
// of closest area.
auto
var_name
=
ctx
.
Inputs
(
"AlgoCache"
)[
0
];
algo_cache
=
ctx
.
scope
()
.
FindVar
(
var_name
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
[
2
]
*
x_dims
[
3
],
search_times
,
0
,
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
[
2
]
*
x_dims
[
3
],
search_times
,
0
,
search_func
);
}
else
{
// Cache searched algo in Var(kCUDNNFwdAlgoCache).
// all conv ops use the same kCUDNNFwdAlgoCache variable.
if
(
ctx
.
scope
().
FindVar
(
kCUDNNFwdAlgoCache
))
{
algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
else
{
// TODO(qingqing) remove const_cast
algo_cache
=
const_cast
<
framework
::
Scope
*>
(
ctx
.
scope
().
parent
())
->
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
search_func
);
}
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
...
...
paddle/fluid/operators/conv_op.cc
浏览文件 @
c4187dbd
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
...
...
@@ -80,6 +81,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
framework
::
OpKernelType
::
kDefaultCustomizedTypeValue
;
framework
::
LibraryType
library
{
framework
::
LibraryType
::
kPlain
};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
auto
input_data_type
=
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
();
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
framework
::
DataLayout
layout
=
framework
::
StringToDataLayout
(
data_format
);
...
...
@@ -93,11 +95,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
platform
::
CanMKLDNNBeUsed
(
ctx
))
{
library
=
framework
::
LibraryType
::
kMKLDNN
;
layout
=
framework
::
DataLayout
::
kMKLDNN
;
customized_type_value
=
kConvMKLDNNFP32
;
customized_type_value
=
(
input_data_type
==
framework
::
DataTypeTrait
<
int8_t
>::
DataType
||
input_data_type
==
framework
::
DataTypeTrait
<
uint8_t
>::
DataType
)
?
kConvMKLDNNINT8
:
kConvMKLDNNFP32
;
}
#endif
auto
input_data_type
=
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
();
if
(
input_data_type
!=
framework
::
proto
::
VarType
::
INT8
&&
input_data_type
!=
framework
::
proto
::
VarType
::
UINT8
)
{
auto
filter_data_type
=
ctx
.
Input
<
Tensor
>
(
"Filter"
)
->
type
();
...
...
@@ -109,8 +114,20 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
"float16 can only be used when CUDNN is used"
);
}
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
(),
layout
,
auto
type
=
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
(),
layout
,
library
,
customized_type_value
);
#ifdef PADDLE_WITH_CUDA
std
::
vector
<
framework
::
KernelConfig
>&
configs
=
kernel_configs_map_
[
type
];
// TODO(dangqingqing): Currently conv_fusion_op use cudnn but sets use_cudnn
// to false. It should be fixed and then here should only create if library
// is kCUDNN.
if
(
configs
.
empty
())
{
std
::
shared_ptr
<
framework
::
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
p
(
new
framework
::
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>
());
configs
.
push_back
(
p
);
}
#endif
return
type
;
}
void
Conv2DOpMaker
::
Make
()
{
...
...
@@ -410,9 +427,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
}
#endif
return
framework
::
OpKernelType
(
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
(),
auto
type
=
framework
::
OpKernelType
(
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
(),
ctx
.
GetPlace
(),
layout_
,
library_
,
customized_type_value
);
#ifdef PADDLE_WITH_CUDA
if
(
library_
==
framework
::
LibraryType
::
kCUDNN
)
{
std
::
vector
<
framework
::
KernelConfig
>&
configs
=
kernel_configs_map_
[
type
];
if
(
configs
.
empty
())
{
std
::
shared_ptr
<
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
p
(
new
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>
());
configs
.
push_back
(
p
);
std
::
shared_ptr
<
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
p2
(
new
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>
());
configs
.
push_back
(
p2
);
}
}
#endif
return
type
;
}
class
Conv2dGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
...
...
paddle/fluid/operators/cross_entropy_op.cc
浏览文件 @
c4187dbd
...
...
@@ -32,14 +32,23 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
int
rank
=
x_dims
.
size
();
PADDLE_ENFORCE_EQ
(
rank
,
label_dims
.
size
(),
"Input(X) and Input(Label) shall have the same rank."
);
bool
check
=
true
;
if
((
!
ctx
->
IsRuntime
())
&&
(
framework
::
product
(
x_dims
)
<=
0
||
framework
::
product
(
label_dims
)
<=
0
))
{
check
=
false
;
}
if
(
check
)
{
PADDLE_ENFORCE_EQ
(
framework
::
slice_ddim
(
x_dims
,
0
,
rank
-
1
),
framework
::
slice_ddim
(
label_dims
,
0
,
rank
-
1
),
"Input(X) and Input(Label) shall have the same shape "
"except the last dimension."
);
}
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"soft_label"
))
{
if
(
check
)
{
PADDLE_ENFORCE_EQ
(
x_dims
[
rank
-
1
],
label_dims
[
rank
-
1
],
"If Attr(soft_label) == true, the last dimension of "
"Input(X) and Input(Label) should be equal."
);
}
}
else
{
PADDLE_ENFORCE_EQ
(
label_dims
[
rank
-
1
],
1UL
,
"If Attr(softLabel) == false, the last dimension of "
...
...
@@ -82,6 +91,14 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
"Input(Y@Grad) and Input(X) should have the same rank."
);
PADDLE_ENFORCE_EQ
(
label_dims
.
size
(),
rank
,
"Input(Label) and Input(X) should have the same rank."
);
bool
check
=
true
;
if
((
!
ctx
->
IsRuntime
())
&&
(
framework
::
product
(
x_dims
)
<=
0
||
framework
::
product
(
label_dims
)
<=
0
))
{
check
=
false
;
}
if
(
check
)
{
PADDLE_ENFORCE_EQ
(
framework
::
slice_ddim
(
x_dims
,
0
,
rank
-
1
),
framework
::
slice_ddim
(
label_dims
,
0
,
rank
-
1
),
"The Input(X) and Input(Label) should have the same "
...
...
@@ -90,12 +107,16 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
framework
::
slice_ddim
(
dy_dims
,
0
,
rank
-
1
),
"The Input(X) and Input(Y@Grad) should have the same "
"shape except the last dimension."
);
}
PADDLE_ENFORCE_EQ
(
dy_dims
[
rank
-
1
],
1
,
"The last dimension of Input(Y@Grad) should be 1."
);
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"soft_label"
))
{
PADDLE_ENFORCE_EQ
(
x_dims
[
rank
-
1
],
label_dims
[
rank
-
1
],
if
(
check
)
{
PADDLE_ENFORCE_EQ
(
x_dims
[
rank
-
1
],
label_dims
[
rank
-
1
],
"When Attr(soft_label) == true, the last dimension of "
"Input(X) and Input(Label) should be equal."
);
}
}
else
{
PADDLE_ENFORCE_EQ
(
label_dims
[
rank
-
1
],
1
,
"When Attr(soft_label) == false, the last dimension of "
...
...
paddle/fluid/operators/data_norm_op.cc
浏览文件 @
c4187dbd
...
...
@@ -140,9 +140,6 @@ class DataNormOpMaker : public framework::OpProtoAndCheckerMaker {
"Scales of the history data batch, "
"will apply to output when training"
)
.
AsIntermediate
();
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
Data Normalization.
...
...
paddle/fluid/operators/detection/prior_box_op.h
浏览文件 @
c4187dbd
...
...
@@ -172,6 +172,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
framework
::
make_ddim
({
1
,
static_cast
<
int
>
(
variances
.
size
())}),
ctx
.
GetPlace
());
auto
var_et
=
framework
::
EigenTensor
<
T
,
2
>::
From
(
var_t
);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for
(
size_t
i
=
0
;
i
<
variances
.
size
();
++
i
)
{
var_et
(
0
,
i
)
=
variances
[
i
];
}
...
...
@@ -181,8 +185,15 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
vars
->
Resize
({
box_num
,
static_cast
<
int
>
(
variances
.
size
())});
auto
e_vars
=
framework
::
EigenMatrix
<
T
,
Eigen
::
RowMajor
>::
From
(
*
vars
);
e_vars
=
var_et
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
box_num
,
1
));
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for
(
int
i
=
0
;
i
<
box_num
;
++
i
)
{
for
(
int
j
=
0
;
j
<
variances
.
size
();
++
j
)
{
e_vars
(
i
,
j
)
=
variances
[
j
];
}
}
vars
->
Resize
(
var_dim
);
}
};
// namespace operators
...
...
paddle/fluid/operators/detection/yolov3_loss_op.cc
浏览文件 @
c4187dbd
...
...
@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
"The ignore threshold to ignore confidence loss."
)
.
SetDefault
(
0.7
);
AddComment
(
R"DOC(
This operator generate
yolov3 loss by
given predict result and ground
This operator generate
s yolov3 loss based on
given predict result and ground
truth boxes.
The output of previous network is in shape [N, C, H, W], while H and W
should be the same,
specify the grid size, each grid point predict given
number boxes, this given number is specified by anchors, it should be
half anchors length, which following will be represented as S. In the
second dimention(the channel dimention), C should be S * (class_num + 5),
c
lass_num is the box categoriy number of source dataset(such as coco),
s
o in the second dimention, stores 4 box location coordinates x, y, w, h
a
nd
confidence score of the box and class one-hot key of each anchor box.
should be the same,
H and W specify the grid size, each grid point predict
given number boxes, this given number, which following will be represented as S,
is specified by the number of anchors, In the second dimension(the channel
dimension), C should be equal to S * (class_num + 5), class_num is the object
c
ategory number of source dataset(such as 80 in coco dataset), so in the
s
econd(channel) dimension, apart from 4 box location coordinates x, y, w, h,
a
lso includes
confidence score of the box and class one-hot key of each anchor box.
While the 4 location coordinates if $$tx, ty, tw, th$$
, the box predictions
correspnd to
:
Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`
, the box predictions
should be as follows
:
$$
b_x = \sigma(t_x) + c_x
b_y = \sigma(t_y) + c_y
b_x = \\sigma(t_x) + c_x
$$
$$
b_y = \\sigma(t_y) + c_y
$$
$$
b_w = p_w e^{t_w}
$$
$$
b_h = p_h e^{t_h}
$$
While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
is specified by anchors.
In the equation above, :math:`c_x, c_y` is the left top corner of current grid
and :math:`p_w, p_h`
is specified by anchors.
As for confidence score, it is the logistic regression value of IoU between
anchor boxes and ground truth boxes, the score of the anchor box which has
the max IoU should be 1, and if the anchor box has IoU bigger th
e
n ignore
the max IoU should be 1, and if the anchor box has IoU bigger th
a
n ignore
thresh, the confidence score loss of this anchor box will be ignored.
Therefore, the yolov3 loss consist of three major parts, box location loss,
...
...
@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
In order to trade off box coordinate losses between big boxes and small
boxes, box coordinate losses will be mutiplied by scale weight, which is
calculated as follow.
calculated as follow
s
.
$$
weight_{box} = 2.0 - t_w * t_h
$$
Final loss will be represented as follow.
Final loss will be represented as follow
s
.
$$
loss = (loss_{xy} + loss_{wh}) * weight_{box}
...
...
paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -77,8 +77,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
}
else
{
functor
.
RunMidWise
(
n
,
pre
,
post
);
}
z
->
set_layout
(
DataLayout
::
kMKLDNN
);
z
->
set_format
(
x
->
format
());
z
->
set_mkldnn_prim_desc
(
x
->
get_mkldnn_prim_desc
());
}
else
{
PADDLE_ENFORCE
(
x
->
layout
()
==
DataLayout
::
kMKLDNN
&&
x
->
format
()
!=
memory
::
format
::
format_undef
,
...
...
@@ -116,7 +115,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
auto
sum_pd
=
sum
::
primitive_desc
(
dst_md
,
scales
,
srcs_pd
);
// create mkldnn memory for dst
memory
dst_memory
=
memory
(
sum_pd
.
dst_primitive_desc
(),
z_data
);
auto
dst_mem_pd
=
sum_pd
.
dst_primitive_desc
();
memory
dst_memory
=
memory
(
dst_mem_pd
,
z_data
);
std
::
vector
<
primitive
::
at
>
inputs
;
inputs
.
push_back
(
srcs
[
0
]);
...
...
@@ -129,9 +129,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
pipeline
.
push_back
(
sum_prim
);
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
z
->
set_layout
(
DataLayout
::
kMKLDNN
);
z
->
set_format
(
(
memory
::
format
)
dst_memory
.
get_primitive_desc
().
desc
().
data
.
format
);
z
->
set_mkldnn_prim_desc
(
dst_mem_pd
);
}
}
};
...
...
@@ -152,24 +150,19 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
auto
*
out
=
dout
;
auto
*
x
=
dout
,
*
y
=
dout
;
auto
set_mkldnn_format
=
[](
Tensor
*
in
,
const
Tensor
*
out
)
{
in
->
set_layout
(
DataLayout
::
kMKLDNN
);
in
->
set_format
(
out
->
format
());
};
if
(
dx
!=
nullptr
&&
dy
!=
nullptr
&&
dx
->
dims
()
==
dy
->
dims
())
{
if
(
dx
->
dims
()
==
dy
->
dims
())
{
auto
blas
=
math
::
GetBlas
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
(
ctx
);
if
(
dx
)
{
blas
.
VCOPY
(
dout
->
numel
(),
dout
->
data
<
T
>
(),
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
set_mkldnn_format
(
dx
,
dout
);
dx
->
set_mkldnn_prim_desc
(
dout
->
get_mkldnn_prim_desc
()
);
}
if
(
dy
)
{
blas
.
VCOPY
(
dout
->
numel
(),
dout
->
data
<
T
>
(),
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
set_mkldnn_format
(
dy
,
dout
);
dy
->
set_mkldnn_prim_desc
(
dout
->
get_mkldnn_prim_desc
()
);
}
}
}
else
{
...
...
paddle/fluid/operators/fake_quantize_op.cc
浏览文件 @
c4187dbd
...
...
@@ -31,7 +31,7 @@ template <typename T>
struct
FindAbsMaxFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CPUDeviceContext
&
ctx
,
const
T
*
in
,
const
int
num
,
T
*
out
)
{
*
out
=
*
(
std
::
max_element
(
in
+
0
,
in
+
num
,
Compare
<
T
>
(
)));
*
out
=
std
::
abs
(
*
(
std
::
max_element
(
in
+
0
,
in
+
num
,
Compare
<
T
>
()
)));
}
};
...
...
@@ -46,10 +46,8 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
platform
::
Transform
<
platform
::
CPUDeviceContext
>
trans
;
trans
(
ctx
,
in
.
data
<
T
>
(),
in
.
data
<
T
>
()
+
in
.
numel
(),
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
ClipFunctor
<
T
>
(
-
s
,
s
));
auto
in_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
in
);
auto
out_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
out
);
out_e
.
device
(
*
ctx
.
eigen_device
())
=
(
bin_cnt
/
s
*
in_e
).
round
();
out_e
.
device
(
*
ctx
.
eigen_device
())
=
(
bin_cnt
/
s
*
out_e
).
round
();
}
};
...
...
paddle/fluid/operators/jit/test.cc
浏览文件 @
c4187dbd
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
*
Licensed under the Apache License, Version 2.0 (the "License");
*
you may not use this file except in compliance with the License.
*
You may obtain a copy of the License at
*
*
http://www.apache.org/licenses/LICENSE-2.0
*
*
Unless required by applicable law or agreed to in writing, software
*
distributed under the License is distributed on an "AS IS" BASIS,
*
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
See the License for the specific language governing permissions and
*
limitations under the License. */
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <random>
#include <string>
...
...
@@ -259,7 +259,7 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
const
std
::
vector
<
T
>&
x
,
const
std
::
vector
<
T
>&
yref
,
const
typename
jit
::
SeqPoolTuples
<
T
>::
attr_type
&
attr
)
{
EXPECT_TRUE
(
tgt
!=
nullptr
);
EXPECT_EQ
(
x
.
size
()
%
yref
.
size
(),
0
);
EXPECT_EQ
(
x
.
size
()
%
yref
.
size
(),
static_cast
<
size_t
>
(
0
)
);
int
w
=
yref
.
size
();
std
::
vector
<
T
>
y
(
w
);
const
T
*
x_data
=
x
.
data
();
...
...
paddle/fluid/operators/lstm_op.h
浏览文件 @
c4187dbd
...
...
@@ -151,9 +151,10 @@ class LSTMKernel : public framework::OpKernel<T> {
lstm_value
.
output_value
=
out_t
.
data
<
T
>
();
lstm_value
.
state_value
=
cell_t
.
data
<
T
>
();
lstm_value
.
state_active_value
=
cell_pre_act_t
.
data
<
T
>
();
T
cell_clip
=
0.0
;
math
::
LstmUnitFunctor
<
DeviceContext
,
T
>::
compute
(
device_ctx
,
lstm_value
,
frame_size
,
cur_batch_size
,
gate_act
,
cell_act
,
cand_act
);
device_ctx
,
lstm_value
,
frame_size
,
cur_batch_size
,
cell_clip
,
gate_act
,
cell_act
,
cand_act
);
lstm_value
.
prev_state_value
=
lstm_value
.
state_value
;
}
...
...
@@ -316,9 +317,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
lstm_value
.
output_value
=
nullptr
;
lstm_grad
.
state_active_grad
=
nullptr
;
int
cur_batch_size
=
bend
-
bstart
;
T
cell_clip
=
0.0
;
math
::
LstmUnitGradFunctor
<
DeviceContext
,
T
>::
compute
(
device_ctx
,
lstm_value
,
lstm_grad
,
frame_size
,
cur_batch_size
,
gate_act
,
cell_act
,
cand_act
);
cell_clip
,
gate_act
,
cell_act
,
cand_act
);
if
(
n
>
0
)
{
int
pre_h_start
=
static_cast
<
int
>
(
batch_starts
[
n
-
1
]);
...
...
paddle/fluid/operators/lstmp_op.cc
浏览文件 @
c4187dbd
...
...
@@ -73,12 +73,6 @@ class LSTMPOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"C0"
),
"Input(C0) of LSTMP operator should not be null after "
"Input(H0) provided."
);
auto
h_dims
=
ctx
->
GetInputDim
(
"H0"
);
auto
c_dims
=
ctx
->
GetInputDim
(
"C0"
);
PADDLE_ENFORCE
(
h_dims
==
c_dims
,
"The dimension of Input(H0) and Input(C0) "
"should be the same."
);
ctx
->
SetOutputDim
(
"OrderedP0"
,
{
h_dims
[
0
],
proj_dims
[
1
]});
}
auto
b_dims
=
ctx
->
GetInputDim
(
"Bias"
);
...
...
@@ -180,11 +174,6 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
"This LoDTensor is obtained in the forward and used in the "
"backward."
)
.
AsIntermediate
();
AddOutput
(
"OrderedP0"
,
"(Tensor) the projection of the initial hidden state "
"H0. This is a tensor with shape (N x P), where N is the "
"batch size and P is the hidden size."
)
.
AsIntermediate
();
AddAttr
<
bool
>
(
"use_peepholes"
,
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections."
)
...
...
@@ -193,6 +182,16 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, defalut: False) "
"whether to compute reversed LSTMP."
)
.
SetDefault
(
false
);
AddAttr
<
float
>
(
"cell_clip"
,
"(float, defalut: 0.0) "
"Clip for Tensor for cell state tensor when clip value is "
"greater than 0.0"
)
.
SetDefault
(
0.0
);
AddAttr
<
float
>
(
"proj_clip"
,
"(float, defalut: 0.0) "
"Clip for Tensor for projection tensor when clip value is "
"greater than 0.0"
)
.
SetDefault
(
0.0
);
AddAttr
<
std
::
string
>
(
"gate_activation"
,
"(string, default: sigmoid)"
...
...
paddle/fluid/operators/lstmp_op.h
浏览文件 @
c4187dbd
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
...
...
@@ -21,17 +22,50 @@ limitations under the License. */
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
using
Tensor
=
framework
::
Tensor
;
using
platform
::
Transform
;
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenMatrix
=
framework
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
T
>
class
_ClipFunctor
{
public:
explicit
_ClipFunctor
(
const
T
min
,
const
T
max
)
:
min_
(
min
),
max_
(
max
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
x
)
const
{
if
(
x
<
min_
)
return
min_
;
else
if
(
x
>
max_
)
return
max_
;
else
return
x
;
}
private:
T
min_
;
T
max_
;
};
template
<
typename
T
>
class
_ClipGradFunctor
{
public:
explicit
_ClipGradFunctor
(
const
T
min
,
const
T
max
)
:
min_
(
min
),
max_
(
max
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
x
,
const
T
&
y
)
const
{
return
(
y
>
min_
&&
y
<
max_
)
?
x
:
0
;
}
private:
T
min_
;
T
max_
;
};
template
<
typename
DeviceContext
,
typename
T
>
inline
void
ReorderInitState
(
const
DeviceContext
&
ctx
,
const
framework
::
Tensor
&
src
,
...
...
@@ -67,9 +101,11 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
hidden_t0
=
ctx
.
Input
<
Tensor
>
(
"H0"
);
auto
*
ordered_proj0
=
ctx
.
Output
<
Tensor
>
(
"OrderedP0"
);
auto
*
cell_t0
=
ctx
.
Input
<
Tensor
>
(
"C0"
);
auto
proj_clip
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"proj_clip"
));
auto
cell_clip
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"cell_clip"
));
auto
*
batch_gate
=
ctx
.
Output
<
LoDTensor
>
(
"BatchGate"
);
batch_gate
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
proj_out
=
ctx
.
Output
<
LoDTensor
>
(
"Projection"
);
...
...
@@ -110,6 +146,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
}
lstmp_value
.
prev_state_value
=
nullptr
;
Tensor
ordered_c0
;
Tensor
ordered_h0
;
framework
::
Vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
...
...
@@ -169,18 +206,9 @@ class LSTMPKernel : public framework::OpKernel<T> {
// Since the batch computing for LSTMP reorders the input sequence
// according to their length. The initialized hidden state also needs
// to reorder.
Tensor
ordered_h0
;
ordered_proj0
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
ReorderInitState
<
DeviceContext
,
T
>
(
device_ctx
,
*
hidden_t0
,
order
,
&
ordered_h0
,
true
);
blas
.
MatMul
(
ordered_h0
,
false
,
*
proj_weight
,
false
,
static_cast
<
T
>
(
1.0
),
ordered_proj0
,
static_cast
<
T
>
(
0.0
));
if
(
proj_act
!=
math
::
detail
::
ActivationType
::
kIdentity
)
{
auto
proj0_dev
=
EigenMatrix
<
T
>::
From
(
*
ordered_proj0
);
ActCompute
(
cell_act
,
place
,
proj0_dev
,
proj0_dev
);
}
blas
.
MatMul
(
*
ordered_proj0
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
blas
.
MatMul
(
ordered_h0
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
}
...
...
@@ -189,8 +217,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
lstmp_value
.
state_value
=
cell_t
.
data
<
T
>
();
lstmp_value
.
state_active_value
=
cell_pre_act_t
.
data
<
T
>
();
math
::
LstmUnitFunctor
<
DeviceContext
,
T
>::
compute
(
device_ctx
,
lstmp_value
,
frame_size
,
cur_batch_size
,
gate_act
,
cell_act
,
cand_act
);
device_ctx
,
lstmp_value
,
frame_size
,
cur_batch_size
,
cell_clip
,
gate_act
,
cell_act
,
cand_act
);
lstmp_value
.
prev_state_value
=
lstmp_value
.
state_value
;
blas
.
MatMul
(
hidden_t
,
false
,
*
proj_weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
proj_t
,
static_cast
<
T
>
(
0.0
));
...
...
@@ -198,6 +226,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto
proj_t_dev
=
EigenMatrix
<
T
>::
From
(
proj_t
);
ActCompute
(
cell_act
,
place
,
proj_t_dev
,
proj_t_dev
);
}
if
(
proj_clip
&&
proj_clip
>
0.0
)
{
T
*
x_data
=
proj_t
.
data
<
T
>
();
int64_t
numel
=
proj_t
.
numel
();
Transform
<
DeviceContext
>
trans
;
trans
(
ctx
.
template
device_context
<
DeviceContext
>(),
x_data
,
x_data
+
numel
,
x_data
,
_ClipFunctor
<
T
>
(
-
1.0
*
proj_clip
,
proj_clip
));
}
}
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
...
...
@@ -239,6 +275,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto
*
proj_out
=
ctx
.
Input
<
LoDTensor
>
(
"Projection"
);
auto
*
cell_out
=
ctx
.
Input
<
LoDTensor
>
(
"Cell"
);
auto
proj_clip
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"proj_clip"
));
auto
cell_clip
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"cell_clip"
));
auto
*
batch_gate
=
ctx
.
Input
<
LoDTensor
>
(
"BatchGate"
);
auto
*
batch_cell_pre_act
=
ctx
.
Input
<
LoDTensor
>
(
"BatchCellPreAct"
);
auto
*
batch_hidden
=
ctx
.
Input
<
LoDTensor
>
(
"BatchHidden"
);
...
...
@@ -253,7 +292,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto
*
bias_g
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
auto
*
h0
=
ctx
.
Input
<
Tensor
>
(
"H0"
);
auto
*
ordered_proj0
=
ctx
.
Input
<
Tensor
>
(
"OrderedP0"
);
auto
*
c0
=
ctx
.
Input
<
Tensor
>
(
"C0"
);
auto
*
h0_g
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"H0"
));
...
...
@@ -363,6 +401,17 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
Tensor
cur_proj
=
batch_proj
.
Slice
(
bstart
,
bend
);
Tensor
proj_g
=
batch_proj_g
.
Slice
(
bstart
,
bend
);
if
(
proj_clip
&&
proj_clip
>
0.0
)
{
T
*
dx_data
=
proj_g
.
data
<
T
>
();
T
*
x_data
=
cur_proj
.
data
<
T
>
();
int64_t
numel
=
proj_g
.
numel
();
Transform
<
DeviceContext
>
trans
;
trans
(
ctx
.
template
device_context
<
DeviceContext
>(),
dx_data
,
dx_data
+
numel
,
x_data
,
dx_data
,
_ClipGradFunctor
<
T
>
(
-
1.0
*
proj_clip
,
proj_clip
));
}
if
(
proj_act
!=
math
::
detail
::
ActivationType
::
kIdentity
)
{
auto
cur_proj_dev
=
EigenMatrix
<
T
>::
From
(
cur_proj
);
auto
proj_g_dev
=
EigenMatrix
<
T
>::
From
(
proj_g
);
...
...
@@ -412,7 +461,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
math
::
LstmUnitGradFunctor
<
DeviceContext
,
T
>::
compute
(
device_ctx
,
lstmp_value
,
lstmp_grad
,
frame_size
,
cur_batch_size
,
gate_act
,
cell_act
,
cand_act
);
cell_clip
,
gate_act
,
cell_act
,
cand_act
);
if
(
n
>
0
)
{
int
pre_h_start
=
static_cast
<
int
>
(
batch_starts
[
n
-
1
]);
...
...
@@ -431,32 +480,15 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
ReorderInitState
<
DeviceContext
,
T
>
(
device_ctx
,
*
h0
,
order
,
&
ordered_h0
,
true
);
if
(
weight_g
)
{
blas
.
MatMul
(
*
ordered_proj0
,
true
,
gate_g
,
false
,
static_cast
<
T
>
(
1.0
),
weight_g
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
ordered_h0
,
true
,
gate_g
,
false
,
static_cast
<
T
>
(
1.0
)
,
weight_g
,
static_cast
<
T
>
(
1.0
));
}
}
if
(
h0
&&
(
h0_g
||
proj_weight_g
))
{
ordered_h0_g
.
mutable_data
<
T
>
(
h0_g
->
dims
(),
ctx
.
GetPlace
());
Tensor
proj0_g
;
proj0_g
.
Resize
({
in_dims
[
0
],
proj_weight
->
dims
()[
1
]});
proj0_g
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
blas
.
MatMul
(
gate_g
,
false
,
*
weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
proj0_g
,
static_cast
<
T
>
(
0.0
));
if
(
proj_act
!=
math
::
detail
::
ActivationType
::
kIdentity
)
{
auto
proj0_dev
=
EigenMatrix
<
T
>::
From
(
*
ordered_proj0
);
auto
proj0_g_dev
=
EigenMatrix
<
T
>::
From
(
proj0_g
);
ActGradCompute
(
cell_act
,
place
,
proj0_dev
,
proj0_dev
,
proj0_g_dev
,
proj0_g_dev
);
}
if
(
h0_g
)
{
blas
.
MatMul
(
proj0_g
,
false
,
*
proj_weight
,
true
,
static_cast
<
T
>
(
1.0
),
&
ordered_h0_g
,
static_cast
<
T
>
(
0.0
));
}
if
(
proj_weight_g
)
{
blas
.
MatMul
(
ordered_h0
,
true
,
proj0_g
,
false
,
static_cast
<
T
>
(
1.0
),
proj_weight_g
,
static_cast
<
T
>
(
1.0
));
}
}
}
}
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
c4187dbd
...
...
@@ -39,6 +39,7 @@ math_library(cross_entropy)
math_library
(
cos_sim_functor
)
math_library
(
depthwise_conv DEPS cub
)
math_library
(
im2col
)
math_library
(
sample_prob
)
math_library
(
sampler
)
math_library
(
gru_compute DEPS activation_functions math_function
)
...
...
paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
浏览文件 @
c4187dbd
...
...
@@ -32,7 +32,8 @@ namespace detail {
template
<
class
T
,
class
Op
>
void
naive_lstm_forward_one_sequence
(
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
ActivationType
active_node
,
int
frame_size
,
T
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
T
r_value_in
;
...
...
@@ -67,7 +68,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
op
(
&
r_value_in
,
&
r_value_ig
,
&
r_value_fg
,
&
r_value_og
,
&
r_prev_state
,
&
r_state
,
&
r_state_atv
,
&
r_out
,
&
r_checkI
,
&
r_checkF
,
&
r_checkO
,
active_node
,
active_gate
,
active_state
);
&
cell_clip
,
active_node
,
active_gate
,
active_state
);
value_in
[
i
]
=
r_value_in
;
value_ig
[
i
]
=
r_value_ig
;
...
...
@@ -82,7 +83,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
template
<
class
T
,
class
Op
>
void
naive_lstm_backward_one_sequence
(
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
ActivationType
active_node
,
T
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
T
r_value_in
;
...
...
@@ -135,7 +136,7 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
&
r_grad_ig
,
&
r_grad_fg
,
&
r_grad_og
,
&
r_prev_state
,
&
r_prev_state_grad
,
&
r_state
,
&
r_state_grad
,
&
r_state_atv
,
&
r_output_grad
,
&
r_checkI
,
&
r_checkF
,
&
r_checkO
,
&
r_checkIGrad
,
&
r_checkFGrad
,
&
r_checkOGrad
,
active_node
,
active_gate
,
active_state
);
&
cell_clip
,
active_node
,
active_gate
,
active_state
);
grad_in
[
i
]
=
r_grad_in
;
grad_ig
[
i
]
=
r_grad_ig
;
...
...
@@ -154,7 +155,8 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
template
<
class
T
,
class
Op
>
void
avx_lstm_forward_one_sequence
(
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
ActivationType
active_node
,
int
frame_size
,
T
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
#ifdef __AVX__
...
...
@@ -194,7 +196,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
op
(
&
r_value_in
,
&
r_value_ig
,
&
r_value_fg
,
&
r_value_og
,
&
r_prev_state
,
&
r_state
,
&
r_state_atv
,
&
r_out
,
&
r_checkI
,
&
r_checkF
,
&
r_checkO
,
active_node
,
active_gate
,
active_state
);
&
cell_clip
,
active_node
,
active_gate
,
active_state
);
value_in
[
i
]
=
r_value_in
;
value_ig
[
i
]
=
r_value_ig
;
...
...
@@ -210,7 +212,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
template
<
class
T
,
class
Op
>
void
avx_lstm_backward_one_sequence
(
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
ActivationType
active_node
,
T
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
#ifdef __AVX__
...
...
@@ -268,7 +270,7 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
&
r_grad_ig
,
&
r_grad_fg
,
&
r_grad_og
,
&
r_prev_state
,
&
r_prev_state_grad
,
&
r_state
,
&
r_state_grad
,
&
r_state_atv
,
&
r_output_grad
,
&
r_checkI
,
&
r_checkF
,
&
r_checkO
,
&
r_checkIGrad
,
&
r_checkFGrad
,
&
r_checkOGrad
,
active_node
,
active_gate
,
active_state
);
&
cell_clip
,
active_node
,
active_gate
,
active_state
);
grad_in
[
i
]
=
r_grad_in
;
grad_ig
[
i
]
=
r_grad_ig
;
...
...
@@ -292,27 +294,27 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
template
<
class
T
,
class
Op
>
void
cpu_lstm_forward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
ActivationType
active_node
,
ActivationType
active_gat
e
,
ActivationType
active_state
)
{
T
cell_clip
,
ActivationType
active_nod
e
,
ActivationType
active_
gate
,
ActivationType
active_
state
)
{
if
(
Op
::
avx
&&
!
(
frame_size
&
(
8
-
1
))
&&
(
std
::
is_same
<
T
,
float
>::
value
))
{
avx_lstm_forward_one_sequence
<
T
>
(
op
,
value
,
frame_size
,
active_node
,
active_gate
,
active_state
);
avx_lstm_forward_one_sequence
<
T
>
(
op
,
value
,
frame_size
,
cell_clip
,
active_
node
,
active_
gate
,
active_state
);
}
else
{
naive_lstm_forward_one_sequence
<
T
>
(
op
,
value
,
frame_size
,
active_node
,
active_gate
,
active_state
);
naive_lstm_forward_one_sequence
<
T
>
(
op
,
value
,
frame_size
,
cell_clip
,
active_
node
,
active_
gate
,
active_state
);
}
}
template
<
class
T
,
class
Op
>
void
cpu_lstm_backward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
ActivationType
active_node
,
int
frame_size
,
T
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
if
(
Op
::
avx
&&
!
(
frame_size
&
(
8
-
1
))
&&
(
std
::
is_same
<
T
,
float
>::
value
))
{
avx_lstm_backward_one_sequence
<
T
>
(
op
,
value
,
grad
,
frame_size
,
active_node
,
active_gate
,
active_state
);
avx_lstm_backward_one_sequence
<
T
>
(
op
,
value
,
grad
,
frame_size
,
cell_clip
,
active_
node
,
active_
gate
,
active_state
);
}
else
{
naive_lstm_backward_one_sequence
<
T
>
(
op
,
value
,
grad
,
frame_size
,
naive_lstm_backward_one_sequence
<
T
>
(
op
,
value
,
grad
,
frame_size
,
cell_clip
,
active_node
,
active_gate
,
active_state
);
}
}
...
...
paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
浏览文件 @
c4187dbd
...
...
@@ -31,7 +31,8 @@ namespace detail {
*/
template
<
class
T
,
class
Op
,
bool
is_batch
>
__global__
void
KeLstmForward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
ActivationType
active_node
,
int
batch_size
,
T
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
const
int
frame_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
...
...
@@ -72,7 +73,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
op
(
&
r_value_in
,
&
r_value_ig
,
&
r_value_fg
,
&
r_value_og
,
&
r_prev_state
,
&
r_state
,
&
r_state_atv
,
&
r_out
,
&
r_checkI
,
&
r_checkF
,
&
r_checkO
,
active_node
,
active_gate
,
active_state
);
&
cell_clip
,
active_node
,
active_gate
,
active_state
);
value
.
gate_value
[
frame_idx
]
=
r_value_in
;
value
.
gate_value
[
frame_idx
+
frame_size
]
=
r_value_ig
;
...
...
@@ -91,7 +92,8 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
template
<
class
T
,
class
Op
,
bool
is_batch
>
__global__
void
KeLstmBackward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
int
batch_size
,
ActivationType
active_node
,
int
batch_size
,
T
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
const
int
frame_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
...
...
@@ -148,8 +150,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
op
(
&
r_value_in
,
&
r_value_ig
,
&
r_value_fg
,
&
r_value_og
,
&
r_grad_in
,
&
r_grad_ig
,
&
r_grad_fg
,
&
r_grad_og
,
&
r_prev_state
,
&
r_prev_state_grad
,
&
r_state
,
&
r_state_grad
,
&
r_state_atv
,
&
r_output_grad
,
&
r_checkI
,
&
r_checkF
,
&
r_checkO
,
&
r_checkIGrad
,
&
r_checkFGrad
,
&
r_checkOGrad
,
active_node
,
active_gate
,
active_state
);
&
r_checkO
,
&
r_checkIGrad
,
&
r_checkFGrad
,
&
r_checkOGrad
,
&
cell_clip
,
active_
node
,
active_
gate
,
active_state
);
grad
.
gate_grad
[
frame_idx
]
=
r_grad_in
;
grad
.
gate_grad
[
frame_idx
+
frame_size
]
=
r_grad_ig
;
...
...
@@ -185,8 +187,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
template
<
class
T
,
class
Op
>
void
gpu_lstm_forward
(
const
platform
::
DeviceContext
&
context
,
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
ActivationType
active_node
,
ActivationType
active_gat
e
,
ActivationType
active_state
)
{
T
cell_clip
,
ActivationType
active_nod
e
,
ActivationType
active_
gate
,
ActivationType
active_
state
)
{
dim3
threads
;
dim3
grid
;
if
(
batch_size
==
1
)
{
...
...
@@ -205,12 +207,12 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
if
(
batch_size
==
1
)
{
KeLstmForward
<
T
,
Op
,
/* is_batch= */
false
><<<
grid
,
threads
,
0
,
stream
>>>
(
op
,
value
,
frame_size
,
batch_size
,
active_node
,
active_gate
,
op
,
value
,
frame_size
,
batch_size
,
cell_clip
,
active_node
,
active_gate
,
active_state
);
}
else
{
KeLstmForward
<
T
,
Op
,
/* is_batch= */
true
><<<
grid
,
threads
,
0
,
stream
>>>
(
op
,
value
,
frame_size
,
batch_size
,
active_node
,
active_gate
,
op
,
value
,
frame_size
,
batch_size
,
cell_clip
,
active_node
,
active_gate
,
active_state
);
}
}
...
...
@@ -218,7 +220,7 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
template
<
class
T
,
class
Op
>
void
gpu_lstm_backward
(
const
platform
::
DeviceContext
&
context
,
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
int
batch_size
,
int
frame_size
,
int
batch_size
,
T
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
dim3
threads
;
...
...
@@ -239,13 +241,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
if
(
batch_size
==
1
)
{
KeLstmBackward
<
T
,
Op
,
/* is_batch= */
false
><<<
grid
,
threads
,
0
,
stream
>>>
(
op
,
value
,
grad
,
frame_size
,
batch_size
,
active_node
,
active_gat
e
,
active_state
);
op
,
value
,
grad
,
frame_size
,
batch_size
,
cell_clip
,
active_nod
e
,
active_
gate
,
active_
state
);
}
else
{
KeLstmBackward
<
T
,
Op
,
/* is_batch= */
true
><<<
grid
,
threads
,
0
,
stream
>>>
(
op
,
value
,
grad
,
frame_size
,
batch_size
,
active_node
,
active_gat
e
,
active_state
);
op
,
value
,
grad
,
frame_size
,
batch_size
,
cell_clip
,
active_nod
e
,
active_
gate
,
active_
state
);
}
}
...
...
paddle/fluid/operators/math/detail/lstm_kernel.h
浏览文件 @
c4187dbd
...
...
@@ -29,7 +29,7 @@ class lstm {
public:
HOSTDEVICE
void
operator
()(
T
*
value_in
,
T
*
value_ig
,
T
*
value_fg
,
T
*
value_og
,
T
*
prev_state
,
T
*
state
,
T
*
state_atv
,
T
*
output
,
T
*
checkI
,
T
*
checkF
,
T
*
checkO
,
T
*
checkI
,
T
*
checkF
,
T
*
checkO
,
T
*
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
...
...
@@ -37,6 +37,15 @@ class lstm {
*
value_ig
=
activation
(
*
value_ig
+
(
*
prev_state
)
*
(
*
checkI
),
active_gate
);
*
value_fg
=
activation
(
*
value_fg
+
(
*
prev_state
)
*
(
*
checkF
),
active_gate
);
*
state
=
(
*
value_in
)
*
(
*
value_ig
)
+
(
*
prev_state
)
*
(
*
value_fg
);
if
(
*
cell_clip
>
0.0
)
{
if
(
*
state
<
-
1.0
*
(
*
cell_clip
))
{
*
state
=
-
1.0
*
(
*
cell_clip
);
}
if
(
*
state
>
*
cell_clip
)
{
*
state
=
*
cell_clip
;
}
}
*
value_og
=
activation
(
*
value_og
+
(
*
state
)
*
(
*
checkO
),
active_gate
);
*
state_atv
=
activation
(
*
state
,
active_state
);
*
output
=
(
*
value_og
)
*
(
*
state_atv
);
...
...
@@ -52,7 +61,7 @@ class lstm {
__m256
*
value_fg
,
__m256
*
value_og
,
__m256
*
prev_state
,
__m256
*
state
,
__m256
*
state_atv
,
__m256
*
output
,
__m256
*
checkI
,
__m256
*
checkF
,
__m256
*
checkO
,
__m256
*
checkF
,
__m256
*
checkO
,
T
*
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
...
...
@@ -65,6 +74,13 @@ class lstm {
active_gate
);
*
state
=
_mm256_add_ps
(
_mm256_mul_ps
(
*
value_in
,
*
value_ig
),
_mm256_mul_ps
(
*
prev_state
,
*
value_fg
));
if
(
*
cell_clip
>
0.0
f
)
{
__m256
min
=
_mm256_set1_ps
(
0.0
f
-
*
cell_clip
);
__m256
max
=
_mm256_set1_ps
(
*
cell_clip
);
*
state
=
_mm256_min_ps
(
max
,
*
state
);
*
state
=
_mm256_max_ps
(
min
,
*
state
);
}
*
value_og
=
activation
(
_mm256_add_ps
(
*
value_og
,
_mm256_mul_ps
(
*
state
,
*
checkO
)),
active_gate
);
*
state_atv
=
activation
(
*
state
,
active_state
);
...
...
@@ -86,15 +102,26 @@ class lstm {
T
*
prev_state
,
T
*
prev_state_grad
,
T
*
state
,
T
*
state_grad
,
T
*
state_atv
,
T
*
output_grad
,
T
*
checkI
,
T
*
checkF
,
T
*
checkO
,
T
*
checkIGrad
,
T
*
checkFGrad
,
T
*
checkOGrad
,
T
*
checkFGrad
,
T
*
checkOGrad
,
T
*
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
*
grad_og
=
activation
((
*
output_grad
)
*
(
*
state_atv
),
*
value_og
,
active_gate
);
if
(
*
cell_clip
>
0.0
f
)
{
if
(
*
state
>=
(
*
cell_clip
)
||
*
state
<=
(
0.0
f
-
(
*
cell_clip
)))
{
*
state_grad
=
0.0
f
;
}
else
{
*
state_grad
+=
activation
((
*
output_grad
)
*
(
*
value_og
),
*
state_atv
,
active_state
)
+
(
*
grad_og
)
*
(
*
checkO
);
}
}
else
{
*
state_grad
+=
activation
((
*
output_grad
)
*
(
*
value_og
),
*
state_atv
,
active_state
)
+
(
*
grad_og
)
*
(
*
checkO
);
}
*
grad_in
=
activation
((
*
state_grad
)
*
(
*
value_ig
),
*
value_in
,
active_node
);
*
grad_ig
=
activation
((
*
state_grad
)
*
(
*
value_in
),
*
value_ig
,
active_gate
);
*
grad_fg
=
...
...
@@ -117,15 +144,24 @@ class lstm {
__m256
*
prev_state
,
__m256
*
prev_state_grad
,
__m256
*
state
,
__m256
*
state_grad
,
__m256
*
state_atv
,
__m256
*
output_grad
,
__m256
*
checkI
,
__m256
*
checkF
,
__m256
*
checkO
,
__m256
*
checkIGrad
,
__m256
*
checkFGrad
,
__m256
*
checkOGrad
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
__m256
*
checkFGrad
,
__m256
*
checkOGrad
,
T
*
cell_clip
,
ActivationType
active_node
,
ActivationType
active_gate
,
ActivationType
active_state
)
{
*
grad_og
=
activation
(
_mm256_mul_ps
(
*
output_grad
,
*
state_atv
),
*
value_og
,
active_gate
);
if
(
*
cell_clip
>
0.0
f
)
{
T
*
state_
=
reinterpret_cast
<
T
*>
(
state
);
if
(
*
state_
>=
(
*
cell_clip
)
||
*
state_
<=
(
0.0
f
-
(
*
cell_clip
)))
{
*
state_grad
=
_mm256_set1_ps
(
0.0
f
);
}
else
{
*
state_grad
=
_mm256_add_ps
(
activation
(
_mm256_mul_ps
(
*
output_grad
,
*
value_og
),
*
state_atv
,
active_state
),
*
state_grad
);
*
state_grad
=
_mm256_add_ps
(
_mm256_mul_ps
(
*
grad_og
,
*
checkO
),
*
state_grad
);
*
state_grad
=
_mm256_add_ps
(
_mm256_mul_ps
(
*
grad_og
,
*
checkO
),
*
state_grad
);
}
}
*
grad_in
=
activation
(
_mm256_mul_ps
(
*
state_grad
,
*
value_ig
),
*
value_in
,
active_node
);
*
grad_ig
=
activation
(
_mm256_mul_ps
(
*
state_grad
,
*
value_in
),
*
value_ig
,
...
...
paddle/fluid/operators/math/lstm_compute.cc
浏览文件 @
c4187dbd
...
...
@@ -24,12 +24,12 @@ template <class T>
struct
LstmUnitFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
static
void
compute
(
const
platform
::
CPUDeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
const
detail
::
ActivationType
&
gate_act
,
T
cell_clip
,
const
detail
::
ActivationType
&
gate_act
,
const
detail
::
ActivationType
&
cell_act
,
const
detail
::
ActivationType
&
cand_act
)
{
for
(
int
b
=
0
;
b
<
batch_size
;
b
++
)
{
detail
::
cpu_lstm_forward
(
detail
::
forward
::
lstm
<
T
>
(),
value
,
frame_size
,
cand_act
,
gate_act
,
cell_act
);
c
ell_clip
,
c
and_act
,
gate_act
,
cell_act
);
value
.
gate_value
+=
frame_size
*
4
;
value
.
state_value
+=
frame_size
;
value
.
state_active_value
+=
frame_size
;
...
...
@@ -45,13 +45,14 @@ template <class T>
struct
LstmUnitGradFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
static
void
compute
(
const
platform
::
CPUDeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
int
batch_size
,
int
frame_size
,
int
batch_size
,
T
cell_clip
,
const
detail
::
ActivationType
&
gate_act
,
const
detail
::
ActivationType
&
cell_act
,
const
detail
::
ActivationType
&
cand_act
)
{
for
(
int
b
=
0
;
b
<
batch_size
;
b
++
)
{
detail
::
cpu_lstm_backward
(
detail
::
backward
::
lstm
<
T
>
(),
value
,
grad
,
frame_size
,
cand_act
,
gate_act
,
cell_act
);
frame_size
,
cell_clip
,
cand_act
,
gate_act
,
cell_act
);
value
.
gate_value
+=
frame_size
*
4
;
value
.
state_value
+=
frame_size
;
...
...
paddle/fluid/operators/math/lstm_compute.cu
浏览文件 @
c4187dbd
...
...
@@ -24,12 +24,12 @@ template <class T>
struct
LstmUnitFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
static
void
compute
(
const
platform
::
CUDADeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
const
detail
::
ActivationType
&
gate_act
,
T
cell_clip
,
const
detail
::
ActivationType
&
gate_act
,
const
detail
::
ActivationType
&
cell_act
,
const
detail
::
ActivationType
&
cand_act
)
{
detail
::
gpu_lstm_forward
<
T
>
(
context
,
detail
::
forward
::
lstm
<
T
>
(),
value
,
frame_size
,
batch_size
,
c
and_act
,
gate
_act
,
cell_act
);
frame_size
,
batch_size
,
c
ell_clip
,
cand
_act
,
gate_act
,
cell_act
);
}
};
...
...
@@ -37,13 +37,13 @@ template <class T>
struct
LstmUnitGradFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
static
void
compute
(
const
platform
::
CUDADeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
int
batch_size
,
int
frame_size
,
int
batch_size
,
T
cell_clip
,
const
detail
::
ActivationType
&
gate_act
,
const
detail
::
ActivationType
&
cell_act
,
const
detail
::
ActivationType
&
cand_act
)
{
detail
::
gpu_lstm_backward
(
context
,
detail
::
backward
::
lstm
<
T
>
(),
value
,
grad
,
frame_size
,
batch_size
,
c
and_act
,
gate
_act
,
cell_act
);
frame_size
,
batch_size
,
c
ell_clip
,
cand
_act
,
gate_act
,
cell_act
);
}
};
...
...
paddle/fluid/operators/math/lstm_compute.h
浏览文件 @
c4187dbd
...
...
@@ -50,7 +50,7 @@ template <typename DeviceContext, typename T>
class
LstmUnitFunctor
{
public:
static
void
compute
(
const
DeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
int
frame_size
,
int
batch_size
,
T
cell_clip
,
const
detail
::
ActivationType
&
gate_act
,
const
detail
::
ActivationType
&
cell_act
,
const
detail
::
ActivationType
&
cand_act
);
...
...
@@ -61,7 +61,7 @@ class LstmUnitGradFunctor {
public:
static
void
compute
(
const
DeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
int
batch_size
,
const
detail
::
ActivationType
&
gate_act
,
T
cell_clip
,
const
detail
::
ActivationType
&
gate_act
,
const
detail
::
ActivationType
&
cell_act
,
const
detail
::
ActivationType
&
cand_act
);
};
...
...
paddle/fluid/operators/math/sample_prob.cc
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/sample_prob.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
template
class
SampleWithProb
<
platform
::
CPUDeviceContext
,
float
>;
template
class
SampleWithProb
<
platform
::
CPUDeviceContext
,
double
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/sample_prob.cu
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <thrust/random.h>
#include <thrust/sort.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/sampler.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
>
__device__
T
gpu_adjust_prob
(
const
T
prob
,
const
int
num_samples
,
const
int
num_tries
)
{
if
(
num_samples
==
num_tries
)
{
return
prob
*
num_samples
;
}
else
{
return
-
expm1
(
num_tries
*
log1p
(
-
prob
));
}
}
class
GPULogUniformSampler
{
public:
__device__
int64_t
Sample
(
float
random
,
const
int
range
,
const
float
log_range
)
const
;
__device__
float
Probability
(
int64_t
value
,
const
float
log_range
)
const
;
};
__device__
int64_t
GPULogUniformSampler
::
Sample
(
float
random
,
const
int
range
,
const
float
log_range
)
const
{
// Got Log Uniform distribution from uniform distribution by
// inverse_transform_sampling method
const
int64_t
value
=
static_cast
<
int64_t
>
(
exp
(
random
*
log_range
))
-
1
;
// Mathematically, value should be <= range_, but might not be due to some
// floating point roundoff, so we mod by range_.
return
value
%
range
;
}
__device__
float
GPULogUniformSampler
::
Probability
(
int64_t
value
,
const
float
log_range
)
const
{
// Given f(x) = 1/[(x+1) * log_range_]
// The value's probability is integral of f(x) from value to (value + 1)
return
(
log
((
value
+
2.0
)
/
(
value
+
1.0
)))
/
log_range
;
}
template
<
typename
T
>
__global__
void
SamplingCondidate
(
const
size_t
n
,
const
int
num_tries
,
const
int
range
,
const
float
log_range
,
const
int
num_true
,
const
std
::
size_t
num_samples
,
const
int64_t
*
label_data
,
int64_t
*
samples_data
,
T
*
probabilities_data
)
{
const
int
num_sampled_classes
=
num_true
+
num_samples
;
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
0
;
GPULogUniformSampler
sampler
;
for
(;
idx
<
n
;
idx
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
col_idx
=
idx
%
num_sampled_classes
;
int
row_idx
=
idx
/
num_sampled_classes
;
if
(
col_idx
<
num_true
)
{
samples_data
[
idx
]
=
label_data
[
row_idx
*
num_true
+
col_idx
];
}
else
{
samples_data
[
idx
]
=
samples_data
[
col_idx
];
}
probabilities_data
[
idx
]
=
sampler
.
Probability
(
samples_data
[
idx
],
log_range
);
probabilities_data
[
idx
]
=
gpu_adjust_prob
(
probabilities_data
[
idx
],
num_samples
,
num_tries
);
}
}
template
<
typename
T
>
int
UniqSampler
(
const
Sampler
&
sampler
,
const
std
::
size_t
num_samples
,
int64_t
*
samples_data
)
{
// sample num_samles unique samples for an example, note that they are not
// all negative samples
std
::
unordered_set
<
int64_t
>
tmp_samples
;
tmp_samples
.
clear
();
int
num_tries
=
0
;
int
j
=
0
;
while
(
j
<
num_samples
)
{
++
num_tries
;
auto
v
=
sampler
.
Sample
();
auto
insert_ok
=
tmp_samples
.
insert
(
v
).
second
;
if
(
!
insert_ok
)
{
continue
;
}
samples_data
[
j
]
=
v
;
++
j
;
}
return
num_tries
;
}
template
<
typename
T
>
void
GPUSampleWithProb
<
T
>::
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
int
seed
,
const
int
dict_size
,
const
bool
uniq
,
const
std
::
size_t
num_samples
,
const
Tensor
*
L
,
Tensor
*
S
,
Tensor
*
P
)
{
// UNDERSTAND: dimension issues
const
auto
lbl_dim
=
L
->
dims
();
const
int
batch_size
=
lbl_dim
[
0
];
const
int
num_true
=
lbl_dim
[
1
];
const
int
num_sampled_classes
=
num_true
+
num_samples
;
framework
::
DDim
ret_dim
{
batch_size
,
num_sampled_classes
};
// UNDERSTAND: raw data view
const
int64_t
*
label_data
=
L
->
data
<
int64_t
>
();
int64_t
*
samples_data
=
S
->
data
<
int64_t
>
();
T
*
probabilities_data
=
P
->
data
<
T
>
();
int
s_size
=
num_samples
;
framework
::
DDim
s_dim
{
s_size
};
Tensor
s
;
int64_t
*
s_data
=
s
.
mutable_data
<
int64_t
>
(
s_dim
,
platform
::
CPUPlace
());
math
::
LogUniformSampler
sampler
(
dict_size
,
seed
);
int
range
=
dict_size
;
float
log_range
=
log
(
range
+
1
);
int
num_tries
=
UniqSampler
<
T
>
(
sampler
,
num_samples
,
s_data
);
VLOG
(
1
)
<<
"num_tries: "
<<
num_tries
;
PADDLE_ENFORCE
(
cudaMemcpy
(
samples_data
+
num_true
,
s_data
,
sizeof
(
int64_t
)
*
num_samples
,
cudaMemcpyHostToDevice
));
int
threads
=
512
;
const
size_t
size
=
batch_size
*
num_sampled_classes
;
int
grid
=
(
batch_size
*
num_sampled_classes
+
threads
-
1
)
/
threads
;
SamplingCondidate
<
T
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
size
,
num_tries
,
range
,
log_range
,
num_true
,
num_samples
,
label_data
,
samples_data
,
probabilities_data
);
}
template
class
GPUSampleWithProb
<
float
>;
template
class
GPUSampleWithProb
<
double
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/sample_prob.h
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/sampler.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
using
Tensor
=
framework
::
Tensor
;
/* UNDERSTAND: utility function to adjust probability for unique sampling,
return whatever as it is if not using unique samping */
template
<
typename
T
>
static
T
adjust_prob
(
const
T
prob
,
const
int
num_samples
,
const
int
num_tries
)
{
if
(
num_samples
==
num_tries
)
{
return
prob
*
num_samples
;
}
else
{
return
-
expm1
(
num_tries
*
log1p
(
-
prob
));
}
}
template
<
typename
DeviceContext
,
typename
T
>
class
SampleWithProb
{
public:
void
operator
()(
const
DeviceContext
&
context
,
const
Sampler
&
sampler
,
const
std
::
size_t
num_samples
,
const
Tensor
*
L
,
Tensor
*
S
,
Tensor
*
P
)
{
// UNDERSTAND: dimension issues
const
auto
lbl_dim
=
L
->
dims
();
const
int
batch_size
=
lbl_dim
[
0
];
const
int
num_true
=
lbl_dim
[
1
];
const
int
num_sampled_classes
=
num_true
+
num_samples
;
framework
::
DDim
ret_dim
{
batch_size
,
num_sampled_classes
};
// UNDERSTAND: raw data view
const
int64_t
*
label_data
=
L
->
data
<
int64_t
>
();
int64_t
*
samples_data
=
S
->
mutable_data
<
int64_t
>
(
ret_dim
,
context
.
GetPlace
());
T
*
probabilities_data
=
P
->
mutable_data
<
T
>
(
ret_dim
,
context
.
GetPlace
());
// temp sets for unique sampling
std
::
unordered_set
<
int64_t
>
tmp_samples
;
int
j
=
0
;
// column index
// add true labels, not that efficient
while
(
j
<
num_true
)
{
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
samples_index
=
i
*
num_sampled_classes
+
j
;
auto
v
=
label_data
[
i
*
num_true
+
j
];
samples_data
[
samples_index
]
=
v
;
probabilities_data
[
samples_index
]
=
sampler
.
Probability
(
v
);
}
++
j
;
}
// sample num_samles unique samples for an example, note that they are not
// all negative samples
tmp_samples
.
clear
();
int
num_tries
=
0
;
while
(
j
<
num_sampled_classes
)
{
++
num_tries
;
auto
v
=
sampler
.
Sample
();
auto
insert_ok
=
tmp_samples
.
insert
(
v
).
second
;
if
(
!
insert_ok
)
{
continue
;
}
auto
p
=
sampler
.
Probability
(
v
);
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
samples_index
=
i
*
num_sampled_classes
+
j
;
samples_data
[
samples_index
]
=
v
;
probabilities_data
[
samples_index
]
=
p
;
}
++
j
;
}
// compute Q(y|x), because of unique sampling, probabilities need to be
// adjusted
for
(
int
k
=
0
;
k
<
num_sampled_classes
;
++
k
)
{
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
samples_index
=
i
*
num_sampled_classes
+
k
;
probabilities_data
[
samples_index
]
=
adjust_prob
(
probabilities_data
[
samples_index
],
num_samples
,
num_tries
);
}
}
}
};
#ifdef PADDLE_WITH_CUDA
template
<
typename
T
>
class
GPUSampleWithProb
{
public:
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
int
seed
,
const
int
dict_size
,
const
bool
uniq
,
const
std
::
size_t
num_samples
,
const
Tensor
*
L
,
Tensor
*
S
,
Tensor
*
P
);
};
#endif
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -96,8 +96,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
std
::
vector
<
int
>
src_tz
=
framework
::
vectorize2int
(
x
->
dims
());
auto
src_format
=
src_tz
.
size
()
==
2
?
mkldnn
::
memory
::
format
::
nc
:
x
->
format
();
auto
src_format
=
x
->
format
();
const
std
::
string
key
=
gethash
(
src_tz
,
algorithm
);
const
std
::
string
key_src_data
=
...
...
@@ -127,10 +126,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
if
(
p_fwd
==
nullptr
)
{
// create mkldnn memory for input X
auto
src_md
=
platform
::
MKLDNNMemDesc
(
src_tz
,
platform
::
MKLDNNGetDataType
<
T
>
(),
src_format
);
auto
src_memory
=
std
::
shared_ptr
<
memory
>
(
new
memory
(
{
src_md
,
mkldnn_engine
}
,
to_void_cast
(
x_data
)));
new
memory
(
x
->
get_mkldnn_prim_desc
()
,
to_void_cast
(
x_data
)));
// save src_memory to be referred in backward path
dev_ctx
.
SetBlob
(
key_src_mem
,
src_memory
);
...
...
@@ -177,8 +174,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
pipeline
.
push_back
(
*
p_fwd
);
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
y
->
set_layout
(
DataLayout
::
kMKLDNN
);
y
->
set_format
(
GetMKLDNNFormat
(
*
dst_memory
));
y
->
set_mkldnn_prim_desc
(
dst_memory
->
get_primitive_desc
());
}
template
<
typename
T
>
...
...
@@ -196,9 +192,6 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
std
::
vector
<
int
>
diff_dst_tz
=
framework
::
vectorize2int
(
diff_y
->
dims
());
auto
diff_y_format
=
diff_dst_tz
.
size
()
==
2
?
mkldnn
::
memory
::
format
::
nc
:
diff_y
->
format
();
const
std
::
string
key
=
gethash
(
diff_dst_tz
,
algorithm
);
const
std
::
string
key_src_data
=
key
+
ctx
.
op
().
Input
(
"Out"
)
+
"@eltwise_fwd_src_data"
;
...
...
@@ -210,8 +203,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
key
+
std
::
to_string
(
*
p_src_layout
)
+
"@eltwise_fwd_src_mem"
;
const
std
::
string
key_fwd_pd
=
key
+
std
::
to_string
(
*
p_src_layout
)
+
"@eltwise_fwd_pd"
;
const
std
::
string
key_with_layouts
=
key
+
std
::
to_string
(
*
p_src_layout
)
+
"-"
+
std
::
to_string
(
diff_y_format
);
const
std
::
string
key_with_layouts
=
key
+
std
::
to_string
(
*
p_src_layout
)
+
"-"
+
std
::
to_string
(
diff_y
->
format
()
);
const
std
::
string
key_diff_src_mem
=
key_with_layouts
+
"@eltwise_diff_src_mem"
;
const
std
::
string
key_diff_dst_mem
=
...
...
@@ -225,7 +218,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx
.
GetBlob
(
key_src_mem
));
PADDLE_ENFORCE
(
src_memory
!=
nullptr
,
"Fail to find src_memory in device context"
);
src_memory
->
set_data_handle
(
*
p_src_data
.
get
()
);
src_memory
->
set_data_handle
(
*
p_src_data
);
std
::
shared_ptr
<
memory
>
diff_src_memory
;
...
...
@@ -234,10 +227,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
if
(
p_grad
==
nullptr
)
{
// create mkldnn memory for input diff_y
auto
diff_dst_md
=
platform
::
MKLDNNMemDesc
(
diff_dst_tz
,
platform
::
MKLDNNGetDataType
<
T
>
(),
diff_y_format
);
auto
diff_dst_memory
=
std
::
shared_ptr
<
memory
>
(
new
memory
(
{
diff_dst_md
,
mkldnn_engine
}
,
to_void_cast
(
diff_y_data
)));
new
memory
(
diff_y
->
get_mkldnn_prim_desc
()
,
to_void_cast
(
diff_y_data
)));
dev_ctx
.
SetBlob
(
key_diff_dst_mem
,
diff_dst_memory
);
// retrieve eltwise primitive desc from device context
...
...
@@ -281,8 +272,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
pipeline
.
push_back
(
*
p_grad
);
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
diff_x
->
set_layout
(
DataLayout
::
kMKLDNN
);
diff_x
->
set_format
(
GetMKLDNNFormat
(
*
diff_src_memory
));
diff_x
->
set_mkldnn_prim_desc
(
diff_src_memory
->
get_primitive_desc
());
}
template
<
typename
T
,
mkldnn
::
algorithm
algorithm
>
...
...
paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -206,17 +206,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if
(
fuse_with_relu
)
flags
|=
mkldnn
::
fuse_bn_relu
;
// create mkldnn memory from input x tensor
mkldnn
::
memory
::
format
input_format
=
platform
::
MKLDNNFormatForSize
(
src_tz
.
size
(),
x
->
format
());
// keys for backward pass
const
std
::
string
key
=
BatchNormMKLDNNHandler
::
GetHash
(
src_tz
,
epsilon
,
flags
,
global_stats
,
input_format
,
src_tz
,
epsilon
,
flags
,
global_stats
,
x
->
format
()
,
ctx
.
op
().
Output
(
"SavedMean"
));
const
std
::
string
key_batch_norm_fwd_pd
=
key
+
"@bn_fwd_pd"
;
auto
user_src_md
=
platform
::
MKLDNNMemDesc
(
{
src_tz
},
platform
::
MKLDNNGetDataType
<
T
>
(),
input_format
);
auto
user_src_md
=
x
->
get_mkldnn_prim_desc
().
desc
();
// create primitive descriptor for batch norm forward
using
bn_fwd_types
=
bn_type_traits
<
mkldnn
::
batch_normalization_forward
>
;
...
...
@@ -230,8 +227,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
BatchNormMKLDNNHandler
handler
(
batch_norm_fwd_pd
,
dev_ctx
,
mkldnn_engine
,
key
);
auto
src_memory
=
handler
.
AcquireSrcMemory
(
user_src_md
,
to_void_cast
(
x_data
));
auto
src_memory
=
handler
.
AcquireSrcMemory
(
x
->
get_mkldnn_prim_desc
(),
to_void_cast
(
x_data
));
// crate mkldnn memory for weights(scale/shift)
auto
scaleshift_memory
=
...
...
@@ -265,8 +262,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
variance_memory
,
false
);
}
y
->
set_layout
(
DataLayout
::
kMKLDNN
);
y
->
set_format
(
platform
::
GetMKLDNNFormat
(
*
dst_memory
));
y
->
set_mkldnn_prim_desc
(
dst_memory
->
get_primitive_desc
());
std
::
vector
<
mkldnn
::
primitive
>
pipeline
;
pipeline
.
push_back
(
*
batch_norm_p
);
...
...
@@ -336,9 +332,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
using
bn_bwd_types
=
bn_type_traits
<
mkldnn
::
batch_normalization_backward
>
;
mkldnn
::
memory
::
format
dst_format
=
platform
::
MKLDNNFormatForSize
(
src_tz
.
size
(),
diff_y
->
format
());
mkldnn
::
memory
::
format
input_format
=
platform
::
MKLDNNFormatForSize
(
src_tz
.
size
(),
x
->
format
());
...
...
@@ -346,14 +339,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// keys from forward pass
const
std
::
string
key
=
BatchNormMKLDNNHandler
::
GetHash
(
src_tz
,
epsilon
,
flags
,
false
,
input_format
,
src_tz
,
epsilon
,
flags
,
false
,
x
->
format
()
,
ctx
.
op
().
Input
(
"SavedMean"
));
const
std
::
string
key_batch_norm_fwd_pd
=
key
+
"@bn_fwd_pd"
;
// keys for primitives reuse
const
std
::
string
key_with_hash
=
key
+
BatchNormMKLDNNHandler
::
GetHash
(
src_tz
,
epsilon
,
flags
,
false
,
input_format
);
x
->
format
()
);
const
std
::
string
key_batch_norm_bwd_p
=
key_with_hash
+
"@batch_norm_bwd_p"
;
const
std
::
string
key_batch_norm_src_mem_p
=
...
...
@@ -373,9 +366,8 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
primitive
reorder_diff_dst
;
bool
is_diff_dst_reordered
=
false
;
auto
user_diff_dst_memory
=
memory
(
{{{
diff_dst_tz
},
memory
::
data_type
::
f32
,
dst_format
},
mkldnn_engine
},
to_void_cast
(
diff_y_data
));
auto
user_diff_dst_memory
=
memory
(
diff_y
->
get_mkldnn_prim_desc
(),
to_void_cast
(
diff_y_data
));
// MKLDNN requires a single piece of memory for scale and shift/bias data
const
size_t
scaleshift_size
=
2
*
ic
;
...
...
@@ -459,10 +451,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx
.
SetBlob
(
key_batch_norm_diff_dst_mem_p
,
diff_dst_memory
);
// set layout/format of output tensors
diff_x
->
set_layout
(
DataLayout
::
kMKLDNN
);
diff_x
->
set_format
((
memory
::
format
)
diff_src_memory
->
get_primitive_desc
()
.
desc
()
.
data
.
format
);
diff_x
->
set_mkldnn_prim_desc
(
diff_src_memory
->
get_primitive_desc
());
}
else
{
// primitives already exist
UpdateMemoryData
(
dev_ctx
,
key_batch_norm_src_mem_p
,
to_void_cast
(
x_data
));
...
...
@@ -487,10 +476,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
}
// set layout/format of output tensors
diff_x
->
set_layout
(
DataLayout
::
kMKLDNN
);
diff_x
->
set_format
((
memory
::
format
)
diff_src_memory
->
get_primitive_desc
()
.
desc
()
.
data
.
format
);
diff_x
->
set_mkldnn_prim_desc
(
diff_src_memory
->
get_primitive_desc
());
}
// execute optional reorder and batch_norm backward primitive
...
...
paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -47,11 +47,6 @@ static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
return
mem_prim_desc
;
}
static
mkldnn
::
memory
::
format
GetDstMemFormat
(
const
concat
::
primitive_desc
&
concat_pd
)
{
return
(
memory
::
format
)
concat_pd
.
dst_primitive_desc
().
desc
().
data
.
format
;
}
static
platform
::
CPUPlace
GetCpuPlace
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
{
auto
place
=
ctx
.
GetPlace
();
...
...
@@ -139,8 +134,7 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto
concat
=
prim_creator
.
CreateConcatPrimitive
(
concat_pd
,
output
,
place
);
stream
(
stream
::
kind
::
eager
).
submit
({
concat
}).
wait
();
output
->
set_layout
(
DataLayout
::
kMKLDNN
);
output
->
set_format
(
GetDstMemFormat
(
concat_pd
));
output
->
set_mkldnn_prim_desc
(
concat_pd
.
dst_primitive_desc
());
}
};
}
// namespace operators
...
...
paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -96,12 +96,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto
*
bias
=
ctx
.
HasInput
(
"Bias"
)
?
ctx
.
Input
<
Tensor
>
(
"Bias"
)
:
nullptr
;
auto
*
output
=
ctx
.
Output
<
Tensor
>
(
"Output"
);
PADDLE_ENFORCE
(
input
->
layout
()
==
DataLayout
::
kMKLDNN
&&
input
->
format
()
!=
memory
::
format
::
format_undef
,
"Wrong layout/format set for Input tensor"
);
PADDLE_ENFORCE
(
filter
->
layout
()
==
DataLayout
::
kMKLDNN
&&
filter
->
format
()
!=
memory
::
format
::
format_undef
,
"Wrong layout/format set for Filter tensor"
);
PADDLE_ENFORCE
(
input
->
layout
()
==
DataLayout
::
kMKLDNN
);
PADDLE_ENFORCE
(
filter
->
layout
()
==
DataLayout
::
kMKLDNN
);
PADDLE_ENFORCE
(
input
->
dims
().
size
()
==
4
||
input
->
dims
().
size
()
==
5
,
"Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"
);
PADDLE_ENFORCE
(
filter
->
dims
().
size
()
==
4
||
filter
->
dims
().
size
()
==
5
,
...
...
@@ -148,14 +144,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std
::
vector
<
primitive
>
pipeline
;
auto
src_format
=
input
->
format
();
// For convolution with groups we need to recreate primitive descriptor
// as Paddle tensor is not having group dims while mkldnn treats
// group as another dimensions
mkldnn
::
memory
::
primitive_desc
user_weights_mpd
=
filter
->
get_mkldnn_prim_desc
();
if
(
g
>
1
)
{
mkldnn
::
memory
::
format
weights_format
=
GetWeightsFormat
(
filter
->
format
(),
g
,
is_conv3d
);
auto
user_src_md
=
platform
::
MKLDNNMemDesc
(
{
src_tz
},
platform
::
MKLDNNGetDataType
<
T
>
(),
src_format
);
auto
user_weights_md
=
platform
::
MKLDNNMemDesc
(
{
weights_tz
},
platform
::
MKLDNNGetDataType
<
T
>
(),
weights_format
);
user_weights_mpd
=
mkldnn
::
memory
::
primitive_desc
(
user_weights_md
,
mkldnn_engine
);
}
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
...
...
@@ -165,7 +166,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto
chosen_memory_format
=
platform
::
data_format_to_memory_format
(
data_format
);
weights_format
=
mkldnn
::
memory
::
format
::
any
;
mkldnn
::
memory
::
format
weights_format
=
mkldnn
::
memory
::
format
::
any
;
// Check the format for user's special output
if
(
chosen_memory_format
!=
mkldnn
::
memory
::
format
::
any
)
{
if
(
is_conv3d
)
{
...
...
@@ -205,10 +206,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
platform
::
ConvMKLDNNHandler
handler
(
conv_pd
,
dev_ctx
,
mkldnn_engine
,
key
);
// create mkldnn memory from input tensors (data/weights)
auto
user_src_memory_p
=
handler
.
AcquireSrcMemory
(
user_src_md
,
to_void_cast
<
T
>
(
input_data
));
auto
user_src_memory_p
=
handler
.
AcquireSrcMemory
(
input
->
get_mkldnn_prim_desc
()
,
to_void_cast
<
T
>
(
input_data
));
auto
user_weights_memory_p
=
handler
.
AcquireWeightsMemory
(
user_weights_md
,
to_void_cast
<
T
>
(
filter_data
));
user_weights_m
p
d
,
to_void_cast
<
T
>
(
filter_data
));
// create reorder primitive if the input format is not the preferred one
auto
src_memory_p
=
...
...
@@ -281,8 +282,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline
.
push_back
(
*
conv_p
);
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
output
->
set_layout
(
DataLayout
::
kMKLDNN
);
output
->
set_format
(
GetMKLDNNFormat
(
*
dst_memory_p
));
output
->
set_mkldnn_prim_desc
(
dst_memory_p
->
get_primitive_desc
());
}
void
ComputeINT8
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
const
{
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
...
...
@@ -947,8 +947,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// push primitive to stream and wait until it's executed
pipeline
.
push_back
(
*
conv_bwd_weights_p
);
filter_grad
->
set_layout
(
DataLayout
::
kMKLDNN
);
filter_grad
->
set_
format
(
GetMKLDNNFormat
(
*
diff_weights_memory_p
)
);
auto
filter_grad_mpd
=
diff_weights_memory_p
->
get_primitive_desc
(
);
filter_grad
->
set_
mkldnn_prim_desc
(
filter_grad_mpd
);
}
if
(
input_grad
)
{
...
...
@@ -971,8 +971,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
pipeline
.
push_back
(
*
conv_bwd_data_p
);
input_grad
->
set_layout
(
DataLayout
::
kMKLDNN
);
input_grad
->
set_format
(
GetMKLDNNFormat
(
*
diff_src_memory_p
));
input_grad
->
set_mkldnn_prim_desc
(
diff_src_memory_p
->
get_primitive_desc
());
}
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
}
...
...
@@ -990,12 +989,12 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE
(
conv2d
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
U8
,
ops
::
kConvMKLDNN
FP32
,
ops
::
kConvMKLDNN
INT8
,
ops
::
ConvMKLDNNOpKernel
<
uint8_t
,
float
>
);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE
(
conv2d
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
S8
,
ops
::
kConvMKLDNN
FP32
,
ops
::
kConvMKLDNN
INT8
,
ops
::
ConvMKLDNNOpKernel
<
int8_t
,
float
>
);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE
(
conv2d_grad
,
MKLDNN
,
...
...
paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -221,8 +221,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline
.
push_back
(
*
conv_p
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
output
->
set_layout
(
DataLayout
::
kMKLDNN
);
output
->
set_format
(
platform
::
GetMKLDNNFormat
(
*
dst_memory_p
));
output
->
set_mkldnn_prim_desc
(
dst_memory_p
->
get_primitive_desc
());
}
private:
...
...
paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -42,8 +42,12 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
// The format of output is set as the mkldnn's format
// TODO(@mozga-intel) The format of matrix sets inside the another layers.
tensor
->
set_layout
(
DataLayout
::
kMKLDNN
);
tensor
->
set_format
(
mkldnn
::
memory
::
format
::
oihw
);
// TODO(jczaja): Remove this hack after checking performance on block layout
auto
tensor_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
tensor
->
dims
()),
mkldnn
::
memory
::
format
::
oihw
);
tensor
->
set_mkldnn_prim_desc
(
tensor_mem_pd
);
}
};
}
// namespace operators
...
...
paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -81,10 +81,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto
e_mid
=
framework
::
EigenTensor
<
T
,
4
>::
From
(
*
mid
);
e_mid
=
e_mid
.
constant
(
k
);
auto
dims
=
paddle
::
framework
::
vectorize2int
(
x
->
dims
());
auto
src_md
=
paddle
::
platform
::
MKLDNNMemDesc
(
dims
,
mkldnn
::
memory
::
data_type
::
f32
,
x
->
format
());
auto
src_md
=
x
->
get_mkldnn_prim_desc
().
desc
();
auto
forward_desc
=
mkldnn
::
lrn_forward
::
desc
{
mkldnn
::
prop_kind
::
forward
,
mkldnn
::
lrn_across_channels
,
...
...
@@ -94,7 +91,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
beta
,
k
};
auto
src_memory_pd
=
mkldnn
::
memory
::
primitive_desc
{
src_md
,
mkldnn_engine
}
;
auto
src_memory_pd
=
x
->
get_mkldnn_prim_desc
()
;
if
(
!
is_test
)
{
const
std
::
string
key
=
ctx
.
op
().
Output
(
"Out"
);
...
...
@@ -111,16 +108,15 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
src_memory
->
set_data_handle
(
static_cast
<
void
*>
(
const_cast
<
T
*>
(
input_data
)));
auto
dst_memory
=
mkldnn
::
memory
(
forward_pd
->
dst_primitive_desc
(),
static_cast
<
void
*>
(
output_data
));
auto
dst_memory_pd
=
forward_pd
->
dst_primitive_desc
();
auto
dst_memory
=
mkldnn
::
memory
(
dst_memory_pd
,
static_cast
<
void
*>
(
output_data
));
auto
workspace_memory
=
insert_to_context
<
mkldnn
::
memory
>
(
key_workspace_memory
,
dev_ctx
,
forward_pd
->
workspace_primitive_desc
());
run_primitive
(
*
forward_pd
,
*
src_memory
,
*
workspace_memory
,
dst_memory
);
out
->
set_layout
(
framework
::
DataLayout
::
kMKLDNN
);
out
->
set_format
(
platform
::
GetMKLDNNFormat
(
dst_memory
));
out
->
set_mkldnn_prim_desc
(
dst_memory_pd
);
}
else
{
auto
forward_pd
=
mkldnn
::
lrn_forward
::
primitive_desc
{
forward_desc
,
mkldnn_engine
};
...
...
@@ -128,13 +124,12 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
src_memory_pd
,
static_cast
<
void
*>
(
const_cast
<
T
*>
(
input_data
))};
auto
workspace_memory
=
mkldnn
::
memory
{
forward_pd
.
workspace_primitive_desc
()};
auto
dst_memory_pd
=
forward_pd
.
dst_primitive_desc
();
auto
dst_memory
=
mkldnn
::
memory
(
forward_pd
.
dst_primitive_desc
(),
static_cast
<
void
*>
(
output_data
));
run_primitive
(
forward_pd
,
src_memory
,
workspace_memory
,
dst_memory
);
out
->
set_layout
(
framework
::
DataLayout
::
kMKLDNN
);
out
->
set_format
(
platform
::
GetMKLDNNFormat
(
dst_memory
));
out
->
set_mkldnn_prim_desc
(
dst_memory_pd
);
}
}
};
...
...
paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/operators/pool_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -29,23 +30,23 @@ using mkldnn::stream;
using
platform
::
to_void_cast
;
// Generate keys for storing/retriving primitives for this operator
// TODO(jczaja): Make hashing function more optimial
static
std
::
string
gethash
(
const
memory
::
dims
&
input_dims
,
std
::
string
CreateKey
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
,
const
memory
::
dims
&
input_dims
,
const
std
::
string
&
pooling_type
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
memory
::
data_type
&
dt
,
const
std
::
string
&
suffix
)
{
auto
dims2str
=
[](
const
memory
::
dims
&
operand_dims
)
{
std
::
string
dstr
=
""
;
for
(
size_t
i
=
0
;
i
<
operand_dims
.
size
();
++
i
)
{
dstr
+=
std
::
to_string
(
operand_dims
[
i
])
+
"-"
;
}
return
dstr
;
}
;
return
dims2str
(
input_dims
)
+
dims2str
(
ksize
)
+
dims2str
(
strides
)
+
dims2str
(
paddings
)
+
std
::
to_string
(
dt
)
+
pooling_type
+
suffix
;
const
memory
::
data_type
&
dt
,
const
std
::
string
&
suffix
)
{
std
::
string
key
;
key
.
reserve
(
platform
::
MKLDNNHandler
::
MaxKeyLength
);
platform
::
MKLDNNHandler
::
AppendKeyDims
(
&
key
,
input_dims
)
;
platform
::
MKLDNNHandler
::
AppendKey
(
&
key
,
pooling_type
);
platform
::
MKLDNNHandler
::
AppendKeyVec
(
&
key
,
ksize
)
;
platform
::
MKLDNNHandler
::
AppendKeyVec
(
&
key
,
strides
);
platform
::
MKLDNNHandler
::
AppendKeyVec
(
&
key
,
paddings
)
;
platform
::
MKLDNNHandler
::
AppendKey
(
&
key
,
std
::
to_string
(
dt
))
;
platform
::
MKLDNNHandler
::
AppendKey
(
&
key
,
suffix
);
return
key
;
}
static
inline
int
ComputeCeiledOutput
(
int
input_size
,
int
kernel_size
,
...
...
@@ -114,7 +115,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn
::
memory
::
data_type
dt
=
paddle
::
framework
::
ToMKLDNNDataType
(
input
->
type
());
const
std
::
string
key
=
gethash
(
src_tz
,
pooling_type
,
ksize
,
strides
,
const
std
::
string
key
=
CreateKey
(
ctx
,
src_tz
,
pooling_type
,
ksize
,
strides
,
paddings
,
dt
,
ctx
.
op
().
Output
(
"Out"
));
const
std
::
string
key_pool_p
=
key
+
"@pool_p"
;
const
std
::
string
key_pool_pd
=
key
+
"@pool_pd"
;
...
...
@@ -198,7 +199,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
}
// push primitive to stream and wait until it's executed
std
::
vector
<
mkldnn
::
primitive
>
pipeline
{
*
(
pool_p
.
get
())
};
std
::
vector
<
mkldnn
::
primitive
>
pipeline
{
*
pool_p
};
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
output
->
set_layout
(
DataLayout
::
kMKLDNN
);
...
...
@@ -294,7 +295,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// Get an unique name from "argument" name of "Out" variable
// This name will be used as key when referring info from device context
const
std
::
string
key
=
gethash
(
diff_src_tz
,
pooling_type
,
ksize
,
strides
,
paddings
,
CreateKey
(
ctx
,
diff_src_tz
,
pooling_type
,
ksize
,
strides
,
paddings
,
memory
::
data_type
::
f32
,
ctx
.
op
().
Input
(
"Out"
));
const
std
::
string
key_pool_bwd_p
=
key
+
"@pool_bwd_p"
;
const
std
::
string
key_pool_diff_src_mem_p
=
key
+
"@pool_diff_src_mem_p"
;
...
...
@@ -367,8 +368,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx
.
SetBlob
(
key_pool_diff_dst_mem_p
,
diff_dst_memory
);
pool_bwd_p
=
std
::
make_shared
<
pooling_backward
>
(
pool_bwd_pd
,
*
(
diff_dst_memory
.
get
()),
*
workspace_memory
,
*
(
diff_src_memory
));
pool_bwd_pd
,
*
diff_dst_memory
,
*
workspace_memory
,
*
diff_src_memory
);
dev_ctx
.
SetBlob
(
key_pool_bwd_p
,
pool_bwd_p
);
}
else
{
...
...
@@ -404,7 +404,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
if
(
is_diff_dst_reordered
)
{
pipeline
.
push_back
(
reorder_diff_dst
);
}
pipeline
.
push_back
(
*
(
pool_bwd_p
.
get
())
);
pipeline
.
push_back
(
*
pool_bwd_p
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
in_x_grad
->
set_layout
(
DataLayout
::
kMKLDNN
);
...
...
paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -66,8 +66,7 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
"Fail to find softmax primitive in device context"
);
if
(
softmax_p
==
nullptr
)
{
softmax_p
=
std
::
make_shared
<
mkldnn
::
softmax_forward
>
(
*
(
softmax_pd_
.
get
()),
*
(
static_cast
<
mkldnn
::
memory
*>
(
src_memory_p
.
get
())),
*
softmax_pd_
,
*
(
static_cast
<
mkldnn
::
memory
*>
(
src_memory_p
.
get
())),
*
(
static_cast
<
mkldnn
::
memory
*>
(
dst_memory_p
.
get
())));
dev_ctx_
.
SetBlob
(
prim_key
,
softmax_p
);
}
else
{
...
...
@@ -88,8 +87,8 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
"Fail to find softmax backward primitive in device context"
);
if
(
softmax_bwd_p
==
nullptr
)
{
softmax_bwd_p
=
std
::
make_shared
<
mkldnn
::
softmax_backward
>
(
*
softmax_bwd_pd_
,
*
(
dst_memory_p
.
get
()),
*
(
diff_dst_memory_p
.
get
())
,
*
(
diff_src_memory_p
.
get
())
);
*
softmax_bwd_pd_
,
*
dst_memory_p
,
*
diff_dst_memory_p
,
*
diff_src_memory_p
);
dev_ctx_
.
SetBlob
(
prim_key
,
softmax_bwd_p
);
}
else
{
is_reusing_
=
true
;
...
...
@@ -159,6 +158,14 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
auto
softmax_p
=
handler
.
AcquireSoftmax
(
softmax_dst_memory_p
,
softmax_src_memory_p
);
// We cannot use softmax_dst_memory_p to get prim desc as
// it contains flattened dims (2D) while output tensor can
// have 2,3,4+ dims
auto
output_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
output
->
dims
()),
mkldnn
::
memory
::
format
::
blocked
);
output
->
set_mkldnn_prim_desc
(
output_mem_pd
);
std
::
vector
<
primitive
>
pipeline
{
*
(
static_cast
<
softmax_forward
::
primitive
*>
(
softmax_p
.
get
()))};
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
...
...
paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -79,15 +79,6 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
memory
::
format
input_format
=
input0
.
format
();
if
(
src_tz
.
size
()
==
1
&&
(
input_format
==
memory
::
format
::
nchw
||
input_format
==
memory
::
format
::
nhwc
))
{
input_format
=
memory
::
format
::
x
;
}
if
(
src_tz
.
size
()
==
2
&&
(
input_format
==
memory
::
format
::
nchw
||
input_format
==
memory
::
format
::
nhwc
))
{
input_format
=
memory
::
format
::
nc
;
}
for
(
int
i
=
0
;
i
<
N
;
i
++
)
{
PADDLE_ENFORCE
(
in_vars
[
i
]
->
IsType
<
LoDTensor
>
(),
"all inputs must be all LoDTensors"
);
...
...
@@ -115,12 +106,12 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
memory
::
desc
(
dst_tz
,
memory
::
data_type
::
f32
,
memory
::
format
::
any
);
auto
sum_pd
=
sum
::
primitive_desc
(
dst_md
,
scales
,
srcs_mpd
);
auto
dst_mem_pd
=
sum_pd
.
dst_primitive_desc
();
std
::
shared_ptr
<
memory
>
dst_mem
;
if
(
in_place
)
{
dst_mem
.
reset
(
new
memory
(
sum_pd
.
dst_primitive_desc
()
));
dst_mem
.
reset
(
new
memory
(
dst_mem_pd
));
}
else
{
dst_mem
.
reset
(
new
memory
(
sum_pd
.
dst_primitive_desc
()
,
output_data
));
dst_mem
.
reset
(
new
memory
(
dst_mem_pd
,
output_data
));
}
std
::
vector
<
mkldnn
::
primitive
::
at
>
inputs
;
for
(
size_t
i
=
0
;
i
<
srcs_mem
.
size
();
++
i
)
{
...
...
@@ -145,107 +136,11 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if
(
in_place
)
pipeline
.
push_back
(
reorder_prim
);
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
output
->
set_layout
(
DataLayout
::
kMKLDNN
);
output
->
set_format
(
output_format
);
}
else
if
(
out_var
->
IsType
<
framework
::
SelectedRows
>
())
{
// TODO(@mozga-intel) Add MKLDNN SelectedRows support
std
::
unique_ptr
<
framework
::
SelectedRows
>
in0
;
if
(
in_place
)
{
// If is in_place, we store the input[0] to in0
auto
&
in_sel0
=
in_vars
[
0
]
->
Get
<
SelectedRows
>
();
auto
&
rows
=
in_sel0
.
rows
();
in0
.
reset
(
new
framework
::
SelectedRows
(
rows
,
in_sel0
.
height
()));
in0
->
mutable_value
()
->
ShareDataWith
(
in_sel0
.
value
());
}
auto
get_selected_row
=
[
&
](
size_t
i
)
->
const
SelectedRows
&
{
if
(
i
==
0
&&
in0
)
{
return
*
in0
.
get
();
}
else
{
return
in_vars
[
i
]
->
Get
<
SelectedRows
>
();
}
};
auto
*
out
=
ctx
.
Output
<
SelectedRows
>
(
"Out"
);
out
->
mutable_rows
()
->
clear
();
auto
*
out_value
=
out
->
mutable_value
();
// Runtime InferShape
size_t
first_dim
=
0
;
for
(
int
i
=
0
;
i
<
N
;
i
++
)
{
auto
&
sel_row
=
get_selected_row
(
i
);
first_dim
+=
sel_row
.
rows
().
size
();
}
std
::
vector
<
int64_t
>
in_dim
;
for
(
int
i
=
0
;
i
<
N
;
i
++
)
{
auto
&
sel_row
=
get_selected_row
(
i
);
if
(
sel_row
.
rows
().
size
()
>
0
)
{
in_dim
=
framework
::
vectorize
(
sel_row
.
value
().
dims
());
break
;
}
}
if
(
in_dim
.
empty
())
{
VLOG
(
3
)
<<
"WARNING: all the inputs are empty"
;
in_dim
=
framework
::
vectorize
(
get_selected_row
(
N
-
1
).
value
().
dims
());
}
else
{
in_dim
[
0
]
=
static_cast
<
int64_t
>
(
first_dim
);
}
in_dim
[
0
]
=
static_cast
<
int64_t
>
(
first_dim
);
out_value
->
Resize
(
framework
::
make_ddim
(
in_dim
));
out_value
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// if all the input sparse vars are empty, no need to
// merge these vars.
if
(
first_dim
==
0UL
)
{
return
;
}
math
::
SelectedRowsAddTo
<
CPUDeviceContext
,
T
>
functor
;
int64_t
offset
=
0
;
for
(
int
i
=
0
;
i
<
N
;
i
++
)
{
auto
&
sel_row
=
get_selected_row
(
i
);
if
(
sel_row
.
rows
().
size
()
==
0
)
{
continue
;
}
PADDLE_ENFORCE_EQ
(
out
->
height
(),
sel_row
.
height
());
functor
(
ctx
.
template
device_context
<
CPUDeviceContext
>(),
sel_row
,
offset
,
out
);
offset
+=
sel_row
.
value
().
numel
();
}
}
else
if
(
out_var
->
IsType
<
framework
::
LoDTensorArray
>
())
{
// TODO(@mozga-intel) Add MKLDNN LoDTensorArray support
auto
&
out_array
=
*
out_var
->
GetMutable
<
framework
::
LoDTensorArray
>
();
for
(
size_t
i
=
in_place
?
1
:
0
;
i
<
in_vars
.
size
();
++
i
)
{
PADDLE_ENFORCE
(
in_vars
[
i
]
->
IsType
<
framework
::
LoDTensorArray
>
(),
"Only support all inputs are TensorArray"
);
auto
&
in_array
=
in_vars
[
i
]
->
Get
<
framework
::
LoDTensorArray
>
();
for
(
size_t
i
=
0
;
i
<
in_array
.
size
();
++
i
)
{
if
(
in_array
[
i
].
numel
()
!=
0
)
{
if
(
i
>=
out_array
.
size
())
{
out_array
.
resize
(
i
+
1
);
}
if
(
out_array
[
i
].
numel
()
==
0
)
{
framework
::
TensorCopy
(
in_array
[
i
],
in_array
[
i
].
place
(),
ctx
.
device_context
(),
&
out_array
[
i
]);
out_array
[
i
].
set_lod
(
in_array
[
i
].
lod
());
}
else
{
PADDLE_ENFORCE
(
out_array
[
i
].
lod
()
==
in_array
[
i
].
lod
());
auto
in
=
EigenVector
<
T
>::
Flatten
(
in_array
[
i
]);
auto
result
=
EigenVector
<
T
>::
Flatten
(
out_array
[
i
]);
result
.
device
(
*
ctx
.
template
device_context
<
MKLDNNDeviceContext
>()
.
eigen_device
())
=
result
+
in
;
}
}
}
}
}
else
{
PADDLE_THROW
(
"Unexpected branch, output variable type is %s"
,
framework
::
ToTypeName
(
out_var
->
Type
()));
output
->
set_mkldnn_prim_desc
(
dst_mem_pd
);
}
else
{
// Fallback to naive version
// TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support
SumKernel
<
CPUDeviceContext
,
T
>
reference_kernel
;
reference_kernel
.
Compute
(
ctx
);
}
}
};
...
...
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
浏览文件 @
c4187dbd
...
...
@@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn_engine
,
key
);
auto
transpose_src_memory_p
=
handler
.
AcquireSrcMemory
(
input
->
format
(),
platform
::
to_void_cast
<
T
>
(
input_data
));
input
->
get_mkldnn_prim_desc
(),
platform
::
to_void_cast
<
T
>
(
input_data
));
auto
transpose_dst_memory_p
=
handler
.
AcquireDstMemory
(
output
,
ctx
.
GetPlace
());
auto
transpose_p
=
handler
.
AcquireTranspose
(
transpose_dst_memory_p
,
...
...
@@ -61,6 +61,15 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std
::
vector
<
mkldnn
::
primitive
>
pipeline
;
pipeline
.
push_back
(
*
transpose_p
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
// Transpose did change logical dimensions of Tensor, but reorder does not.
// Reorder does change only physical layout eg. format , strides
// so we need to create new primitive descriptor with changed logical layout
// so it match output shape
auto
output_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
output
->
dims
()),
mkldnn
::
memory
::
format
::
blocked
);
output
->
set_mkldnn_prim_desc
(
output_mem_pd
);
}
};
...
...
@@ -102,8 +111,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
platform
::
TransposeMKLDNNHandler
handler
(
nchw_tz
,
reversed_axis
,
dev_ctx
,
mkldnn_engine
,
key
);
auto
transpose_src_memory_p
=
handler
.
AcquireSrcMemory
(
out_grad
->
format
(),
platform
::
to_void_cast
<
T
>
(
out_grad_data
));
auto
transpose_src_memory_p
=
handler
.
AcquireSrcMemory
(
out_grad
->
get_mkldnn_prim_desc
(),
platform
::
to_void_cast
<
T
>
(
out_grad_data
));
auto
transpose_dst_memory_p
=
handler
.
AcquireDstMemory
(
x_grad
,
ctx
.
GetPlace
());
auto
transpose_p
=
handler
.
AcquireTranspose
(
transpose_dst_memory_p
,
...
...
@@ -112,6 +122,15 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std
::
vector
<
mkldnn
::
primitive
>
pipeline
;
pipeline
.
push_back
(
*
transpose_p
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
// Transpose did change logical dimensions of Tensor, but reorder does not.
// Reorder does change only physical layout eg. format , strides
// so we need to create new primitive descriptor with changed logical layout
// so it match output shape
auto
x_grad_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
x_grad
->
dims
()),
mkldnn
::
memory
::
format
::
blocked
);
x_grad
->
set_mkldnn_prim_desc
(
x_grad_mem_pd
);
}
};
...
...
paddle/fluid/operators/pool_op.cc
浏览文件 @
c4187dbd
...
...
@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
"be ignored."
);
// TODO(Chengduo): Add checker.
// (Currently,
// TypedAttrChecker don't support vector type.)
AddAttr
<
bool
>
(
"global_pooling"
,
AddAttr
<
bool
>
(
"global_pooling"
,
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, k
size and paddings will be ignored."
)
"If global_pooling = true, kernel
size and paddings will be ignored."
)
.
SetDefault
(
false
);
AddAttr
<
std
::
vector
<
int
>>
(
"strides"
,
"(vector<int>, default {1, 1}), strides(height, "
...
...
@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
"paddings"
,
"(vector<int>, default {0,0}), paddings(height, width) of pooling "
"operator."
"If global_pooling = true, paddings and ksize will be ignored."
)
"If global_pooling = true, paddings and k
ernel
size will be ignored."
)
.
SetDefault
({
0
,
0
});
AddAttr
<
bool
>
(
"exclusive"
,
...
...
@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"ceil_mode"
,
"(bool, default false) Wether to use the ceil function to calculate "
"(bool, default false) W
h
ether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used."
)
.
SetDefault
(
false
);
...
...
@@ -262,28 +263,37 @@ Example:
For exclusive = false:
$$
hstart = i * strides[0] - paddings[0]
$$
$$
hend = hstart + ksize[0]
$$
$$
wstart = j * strides[1] - paddings[1]
$$
$$
wend = wstart + ksize[1]
$$
$$
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
$$
For exclusive = true:
$$
hstart = max(0, i * strides[0] - paddings[0])
$$
$$
hend = min(H, hstart + ksize[0])
$$
$$
wstart = max(0, j * strides[1] - paddings[1])
$$
$$
wend = min(W, wstart + ksize[1])
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
For adaptive = true:
$$
hstart = floor(i * H_{in} / H_{out})
hend = ceil((i + 1) * H_{in} / H_{out})
wstart = floor(j * W_{in} / W_{out})
wend = ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
)DOC"
);
}
...
...
@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
AddAttr
<
bool
>
(
"global_pooling"
,
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, k
size and paddings wille
be ignored."
)
"If global_pooling = true, k
ernel size and paddings will
be ignored."
)
.
SetDefault
(
false
);
AddAttr
<
std
::
vector
<
int
>>
(
"strides"
,
...
...
@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"ceil_mode"
,
"(bool, default false) Wether to use the ceil function to calculate "
"(bool, default false) W
h
ether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used."
)
.
SetDefault
(
false
);
...
...
@@ -393,45 +403,65 @@ Example:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
For ceil_mode = false:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$
For ceil_mode = true:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
$$
For exclusive = false:
$$
dstart = i * strides[0] - paddings[0]
$$
$$
dend = dstart + ksize[0]
$$
$$
hstart = j * strides[1] - paddings[1]
$$
$$
hend = hstart + ksize[1]
$$
$$
wstart = k * strides[2] - paddings[2]
$$
$$
wend = wstart + ksize[2]
$$
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$
For exclusive = true:
$$
dstart = max(0, i * strides[0] - paddings[0])
$$
$$
dend = min(D, dstart + ksize[0])
hstart = max(0, j * strides[1] - paddings[1])
$$
$$
hend = min(H, hstart + ksize[1])
$$
$$
wstart = max(0, k * strides[2] - paddings[2])
$$
$$
wend = min(W, wstart + ksize[2])
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
For adaptive = true:
$$
dstart = floor(i * D_{in} / D_{out})
dend = ceil((i + 1) * D_{in} / D_{out})
hstart = floor(j * H_{in} / H_{out})
hend = ceil((j + 1) * H_{in} / H_{out})
wstart = floor(k * W_{in} / W_{out})
wend = ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
...
...
paddle/fluid/operators/sample_logits_op.cc
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sample_logits_op.h"
#include "paddle/fluid/operators/math/sample_prob.h"
namespace
paddle
{
namespace
operators
{
class
SampleLogitsOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"Logits"
,
"(Tensor, default: Tensor<float>), The unscaled log probabilities "
"which is a 2-D tensor with shape [N x K]. N is the batch_size, "
"and K is the class number."
);
AddInput
(
"Labels"
,
"(Tensor) The ground truth which is a 2-D tensor. Labels is a "
"Tensor<int64> with shape [N x NT], where NT is the number of"
"true labels for each example."
);
AddInput
(
"CustomizedSamples"
,
"(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
"NT + S],"
" where N is the batch size, NT is the number of true labels "
"and S is the number of negtive sample for each example."
"The first NT elements of each row should be the same with true "
"labels, "
"followed by S custom negtive samples. This tensor"
"is only used when use_customized_samples is true."
)
.
AsDispensable
();
AddInput
(
"CustomizedProbabilities"
,
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
"The tensor has the same shape with CustomSamples,"
"and each element represents probability of element in CustomSamples. "
"This "
"tensor is only used when use_customized_samples is true."
)
.
AsDispensable
();
AddOutput
(
"Samples"
,
"(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
"NT + S]."
"The outputs value of sampler, including NT true lables and S "
"negetive samples "
"for each example. This will be used in"
"backward calculation."
)
.
AsIntermediate
();
AddOutput
(
"Probabilities"
,
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
"The probabilites of sampled positive and negtive labels."
)
.
AsIntermediate
();
AddOutput
(
"SampledLogits"
,
"(Tensor, default: Tensor<float>), A 2-D tensor with shape"
"[N, NT + S]. The outputs value of sampled logits, which will be"
"used in backward propagation."
)
.
AsIntermediate
();
AddOutput
(
"SampledLabels"
,
"(Tensor, default: Tensor<int64>), A 2-D tensor. The sampled labels"
"with shape [N, NT]. The tonsor contains hard labels as input to "
" softmax op, that is 0, 1, ..., NT-1 because of the first NT elements"
" of Sampels are positive lables."
);
AddAttr
<
bool
>
(
"use_customized_samples"
,
"An indicator whether to use customized samples with probabilities, if "
"True"
"the operator will use customized samples and customized probabilities"
"otherwise, the operator will generate them by itself."
)
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"uniq"
,
"An indicator whether to sample non-repetitive negtive labels, if True"
"the operator will sample negtive labels without replacement."
"Otherwise, the operator will sample negtive labels with replacement."
)
.
SetDefault
(
true
);
AddAttr
<
bool
>
(
"remove_accidental_hits"
,
"An indicator whether to remove accidental hits when samples hits true"
"labels, the removal is implemented by subtracting the corresponding"
"logits by float_max to subpress their softmax to be zero."
)
.
SetDefault
(
true
);
AddAttr
<
int
>
(
"num_samples"
,
"The number of negative samples."
);
AddAttr
<
int
>
(
"seed"
,
"Random seed for generating samples"
).
SetDefault
(
0
);
AddComment
(
R"DOC(
"""
Computes sampled output training logits and labels suitable for implementing
sampled softmax.
"""
)DOC"
);
}
};
class
SampleLogitsOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Logits"
),
"Input(Logits) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Labels"
),
"Input(Labels) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Samples"
),
"Output(Samples) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Probabilities"
),
"Output(Probabilities) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"SampledLogits"
),
"Output(SampledLogits) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"SampledLabels"
),
"Output(SampledLabels) should be not null."
);
auto
logits_dims
=
ctx
->
GetInputDim
(
"Logits"
);
auto
labels_dims
=
ctx
->
GetInputDim
(
"Labels"
);
PADDLE_ENFORCE_EQ
(
logits_dims
.
size
(),
2UL
,
"The logits of softmax_with_cross_entropy should be a 2-D tensor."
);
PADDLE_ENFORCE_EQ
(
labels_dims
.
size
(),
2UL
,
"The labels should be a 2-D tensor."
);
const
int
num_samples
=
ctx
->
Attrs
().
Get
<
int
>
(
"num_samples"
);
const
int
num_sampled_classes
=
labels_dims
[
1
]
+
num_samples
;
ctx
->
SetOutputDim
(
"Samples"
,
{
logits_dims
[
0
],
num_sampled_classes
});
ctx
->
SetOutputDim
(
"Probabilities"
,
{
logits_dims
[
0
],
num_sampled_classes
});
ctx
->
SetOutputDim
(
"SampledLogits"
,
{
logits_dims
[
0
],
num_sampled_classes
});
ctx
->
SetOutputDim
(
"SampledLabels"
,
{
logits_dims
[
0
],
labels_dims
[
1
]});
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Logits"
));
framework
::
OpKernelType
kt
=
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
return
kt
;
}
};
// UNDERSTAND: InferShape for Grad
class
SampleLogitsOpGrad
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Logits"
),
"Input(Logits) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Labels"
),
"Input(Labels) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Samples"
),
"Input(Samples) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"SampledLogits"
),
"Input(SampledLogits) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"SampledLogits"
)),
"Input(SampledLogits@Grad) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Logits"
)),
"Output(Logits@Grad) should be not null."
);
auto
logit_dims
=
ctx
->
GetInputDim
(
"Logits"
);
auto
label_dims
=
ctx
->
GetInputDim
(
"Labels"
);
PADDLE_ENFORCE_EQ
(
label_dims
.
size
(),
2UL
,
"The label should be a 2-D tensor."
);
PADDLE_ENFORCE_EQ
(
logit_dims
.
size
(),
2UL
,
"The logits should be a 2-D tensor."
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Logits"
),
ctx
->
GetInputDim
(
"Logits"
));
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
framework
::
GradVarName
(
"SampledLogits"
)));
framework
::
OpKernelType
kt
=
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
return
kt
;
}
};
// UNDERSTAND: what's the rule for making a GradMaker TODO
class
SampleLogitsGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
public:
using
framework
::
SingleGradOpDescMaker
::
SingleGradOpDescMaker
;
protected:
std
::
unique_ptr
<
framework
::
OpDesc
>
Apply
()
const
override
{
auto
*
grad_op
=
new
framework
::
OpDesc
();
grad_op
->
SetType
(
"sample_logits_grad"
);
grad_op
->
SetInput
(
"Logits"
,
Input
(
"Logits"
));
grad_op
->
SetInput
(
"Labels"
,
Input
(
"Labels"
));
grad_op
->
SetInput
(
"Samples"
,
Output
(
"Samples"
));
grad_op
->
SetInput
(
"SampledLogits"
,
Output
(
"SampledLogits"
));
grad_op
->
SetInput
(
framework
::
GradVarName
(
"SampledLogits"
),
OutputGrad
(
"SampledLogits"
));
grad_op
->
SetOutput
(
framework
::
GradVarName
(
"Logits"
),
InputGrad
(
"Logits"
));
grad_op
->
SetAttrMap
(
Attrs
());
return
std
::
unique_ptr
<
framework
::
OpDesc
>
(
grad_op
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
sample_logits
,
ops
::
SampleLogitsOp
,
ops
::
SampleLogitsOpMaker
,
ops
::
SampleLogitsGradMaker
);
REGISTER_OPERATOR
(
sample_logits_grad
,
ops
::
SampleLogitsOpGrad
);
REGISTER_OP_CPU_KERNEL
(
sample_logits
,
ops
::
SampleLogitsKernel
<
float
>
,
ops
::
SampleLogitsKernel
<
double
>
);
REGISTER_OP_CPU_KERNEL
(
sample_logits_grad
,
ops
::
SampleLogitsGradKernel
<
float
>
,
ops
::
SampleLogitsGradKernel
<
double
>
);
paddle/fluid/operators/sample_logits_op.cu
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/sample_logits_op.h"
namespace
paddle
{
namespace
operators
{
// UNDERSTAND: something like take_along_axis in numpy.
template
<
typename
T
>
__global__
void
GPUTakeAlongD1
(
size_t
size
,
const
int
batch_size
,
const
int
array_slice_size
,
const
int
idx_slice_size
,
const
T
*
p_array
,
const
int64_t
*
p_index
,
T
*
p_value
)
{
const
auto
value_slice_size
=
idx_slice_size
;
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
blockDim
.
x
*
gridDim
.
x
;
for
(;
idx
<
size
;
idx
+=
step_size
)
{
int
i
=
idx
/
idx_slice_size
;
auto
array_index
=
p_index
[
idx
];
p_value
[
idx
]
=
p_array
[
i
*
array_slice_size
+
array_index
];
}
}
// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate
// indices, scatter is done in += way.
template
<
typename
T
>
__global__
void
GPUPutAlongD1
(
size_t
size
,
const
int
batch_size
,
const
int
array_slice_size
,
const
int
idx_slice_size
,
T
*
p_array
,
const
int64_t
*
p_index
,
const
T
*
p_value
)
{
const
auto
value_slice_size
=
idx_slice_size
;
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
blockDim
.
x
*
gridDim
.
x
;
// size == batch_size
for
(;
idx
<
size
;
idx
+=
step_size
)
{
int
i
=
idx
;
for
(
int
j
=
0
;
j
<
idx_slice_size
;
++
j
)
{
auto
array_index
=
p_index
[
i
*
idx_slice_size
+
j
];
p_array
[
i
*
array_slice_size
+
array_index
]
+=
p_value
[
i
*
idx_slice_size
+
j
];
}
}
}
// UNDERSTAND: set label as 0,1,...,num_true-1
template
<
typename
T
>
__global__
void
GPUSetLabel
(
size_t
size
,
const
int
num_true
,
int64_t
*
p_array
)
{
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
blockDim
.
x
*
gridDim
.
x
;
for
(;
idx
<
size
;
idx
+=
step_size
)
{
p_array
[
idx
]
=
idx
%
num_true
;
}
}
// UNDERSTAND: compute accidentdal hits from samples and minus corresponding
// logits by a float max, here 1e20
template
<
typename
T
>
__global__
void
gpu_compute_remove_accidental_hits
(
const
int
size
,
const
int
num_true
,
const
int
idx_slice_size
,
const
int64_t
*
p_index
,
T
*
p_value
)
{
const
auto
value_slice_size
=
idx_slice_size
;
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
blockDim
.
x
*
gridDim
.
x
;
for
(;
idx
<
size
;
idx
+=
step_size
)
{
int
i
=
idx
/
idx_slice_size
;
if
(
idx
%
idx_slice_size
<
num_true
)
continue
;
for
(
int
j
=
0
;
j
<
num_true
;
++
j
)
{
const
auto
true_idx
=
i
*
idx_slice_size
+
j
;
if
(
p_index
[
true_idx
]
==
p_index
[
idx
])
{
p_value
[
idx
]
-=
1e20
;
break
;
}
}
}
}
template
<
typename
T
>
class
SampleLogitsCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
using
Tensor
=
framework
::
Tensor
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
// get necessary inputs
const
Tensor
*
logits
=
context
.
Input
<
Tensor
>
(
"Logits"
);
const
Tensor
*
labels
=
context
.
Input
<
Tensor
>
(
"Labels"
);
VLOG
(
3
)
<<
"Enter SampleLogitsCUDAKernel"
;
// get necessary outputs
Tensor
*
samples
=
context
.
Output
<
Tensor
>
(
"Samples"
);
Tensor
*
probabilities
=
context
.
Output
<
Tensor
>
(
"Probabilities"
);
Tensor
*
sampled_logits
=
context
.
Output
<
Tensor
>
(
"SampledLogits"
);
Tensor
*
sampled_labels
=
context
.
Output
<
Tensor
>
(
"SampledLabels"
);
// shapes
const
auto
batch_size
=
logits
->
dims
()[
0
];
const
auto
num_classes
=
logits
->
dims
()[
1
];
const
auto
labels_dim
=
labels
->
dims
();
const
auto
num_true
=
labels_dim
[
1
];
const
auto
samples_dim
=
samples
->
dims
();
// attrs
const
auto
num_samples
=
context
.
Attr
<
int
>
(
"num_samples"
);
const
bool
use_customized_samples
=
context
.
Attr
<
bool
>
(
"use_customized_samples"
);
const
bool
uniq
=
context
.
Attr
<
bool
>
(
"uniq"
);
const
bool
remove_accidental_hits
=
context
.
Attr
<
bool
>
(
"remove_accidental_hits"
);
// device contexts
auto
&
dev_ctx
=
context
.
cuda_device_context
();
// UNDERSTAND: allocate memories for temporaries
sampled_logits
->
mutable_data
<
T
>
(
samples_dim
,
context
.
GetPlace
());
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
sampled_logits
,
static_cast
<
T
>
(
0
));
auto
sampled_labels_data
=
sampled_labels
->
mutable_data
<
int64_t
>
(
labels_dim
,
context
.
GetPlace
());
int
threads
=
512
;
size_t
size
=
batch_size
*
num_true
;
int
grid
=
(
size
+
threads
-
1
)
/
threads
;
GPUSetLabel
<
T
><<<
grid
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
size
,
num_true
,
sampled_labels_data
);
if
(
use_customized_samples
)
{
const
Tensor
*
customized_samples
=
context
.
Input
<
Tensor
>
(
"CustomizedSamples"
);
const
Tensor
*
customized_probabilities
=
context
.
Input
<
Tensor
>
(
"CustomizedProbabilities"
);
samples
->
ShareDataWith
(
*
customized_samples
);
probabilities
->
ShareDataWith
(
*
customized_probabilities
);
}
else
{
samples
->
mutable_data
<
int64_t
>
(
context
.
GetPlace
());
probabilities
->
mutable_data
<
T
>
(
samples_dim
,
context
.
GetPlace
());
// UNDERSTAND: sampling
const
auto
seed
=
context
.
Attr
<
int
>
(
"seed"
);
auto
sampler_with_prob
=
math
::
GPUSampleWithProb
<
T
>
();
sampler_with_prob
(
context
.
cuda_device_context
(),
seed
,
num_classes
,
uniq
,
num_samples
,
labels
,
samples
,
probabilities
);
}
// UNDERSTAND: gather sampled logits and remove accidental hits if needed
const
auto
num_take
=
samples
->
dims
()[
1
];
const
auto
array_dims
=
logits
->
dims
();
const
auto
idx_dims
=
samples
->
dims
();
const
T
*
p_array
=
logits
->
data
<
T
>
();
const
int64_t
*
p_index
=
samples
->
data
<
int64_t
>
();
T
*
p_value
=
sampled_logits
->
data
<
T
>
();
// src slice size
const
auto
array_slice_size
=
array_dims
[
1
];
// index slice size
const
auto
idx_slice_size
=
idx_dims
[
1
];
size
=
batch_size
*
num_take
;
grid
=
(
size
+
threads
-
1
)
/
threads
;
GPUTakeAlongD1
<
T
><<<
grid
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
size
,
batch_size
,
array_slice_size
,
idx_slice_size
,
p_array
,
p_index
,
p_value
);
if
(
remove_accidental_hits
)
{
const
size_t
size
=
batch_size
*
(
num_true
+
num_samples
);
int
grid
=
(
size
+
threads
-
1
)
/
threads
;
gpu_compute_remove_accidental_hits
<
T
><<<
grid
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
size
,
num_true
,
idx_slice_size
,
p_index
,
p_value
);
}
// subtracted sampled logits with logQ(y|x)
auto
probs
=
EigenMatrix
<
T
>::
From
(
*
probabilities
);
auto
smp_logits
=
EigenMatrix
<
T
>::
From
(
*
sampled_logits
);
smp_logits
.
device
(
*
dev_ctx
.
eigen_device
())
=
(
smp_logits
-
probs
.
log
().
unaryExpr
(
TolerableValue
<
T
>
()))
.
unaryExpr
(
TolerableValue
<
T
>
());
}
};
template
<
typename
T
>
class
SampleLogitsGradCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
using
Tensor
=
framework
::
Tensor
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
logits_grad
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Logits"
));
const
Tensor
*
samples
=
context
.
Input
<
Tensor
>
(
"Samples"
);
const
Tensor
*
sampled_logits_grad
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"SampledLogits"
));
logits_grad
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
&
dev_ctx
=
context
.
cuda_device_context
();
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
logits_grad
,
static_cast
<
T
>
(
0
));
// UNDERSTAND: scatter it back to logit_grad
const
auto
batch_size
=
samples
->
dims
()[
0
];
const
auto
num_put
=
samples
->
dims
()[
1
];
const
auto
array_dims
=
logits_grad
->
dims
();
const
auto
idx_dims
=
samples
->
dims
();
T
*
p_array
=
logits_grad
->
data
<
T
>
();
const
int64_t
*
p_index
=
samples
->
data
<
int64_t
>
();
const
T
*
p_value
=
sampled_logits_grad
->
data
<
T
>
();
// src slice size
const
auto
array_slice_size
=
array_dims
[
1
];
// index slice size
const
auto
idx_slice_size
=
idx_dims
[
1
];
int
threads
=
128
;
const
size_t
size
=
batch_size
;
int
grid
=
(
size
+
threads
-
1
)
/
threads
;
GPUPutAlongD1
<
T
><<<
grid
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
size
,
batch_size
,
array_slice_size
,
idx_slice_size
,
p_array
,
p_index
,
p_value
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
sample_logits
,
ops
::
SampleLogitsCUDAKernel
<
float
>
,
ops
::
SampleLogitsCUDAKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
sample_logits_grad
,
ops
::
SampleLogitsGradCUDAKernel
<
float
>
,
ops
::
SampleLogitsGradCUDAKernel
<
double
>
);
paddle/fluid/operators/sample_logits_op.h
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenMatrix
=
framework
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
T
>
struct
TolerableValue
{
HOSTDEVICE
T
operator
()(
const
T
&
x
)
const
{
PADDLE_ASSERT
(
std
::
is_floating_point
<
T
>::
value
);
const
T
kApproInf
=
1e20
;
if
(
x
==
INFINITY
)
return
kApproInf
;
if
(
x
==
-
INFINITY
)
return
-
kApproInf
;
return
x
;
}
};
// UNDERSTAND: something like take_along_axis in numpy.
template
<
typename
T
>
static
void
CPUTakeAlongD1
(
const
platform
::
DeviceContext
&
ctx
,
const
framework
::
Tensor
&
array
,
const
framework
::
Tensor
&
index
,
framework
::
Tensor
*
value
)
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()));
// UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
PADDLE_ENFORCE
(
index
.
dims
().
size
()
==
2
&&
array
.
dims
().
size
()
==
2
&&
index
.
dims
()[
0
]
==
array
.
dims
()[
0
]
&&
index
.
dims
()
==
value
->
dims
());
const
auto
batch_size
=
index
.
dims
()[
0
];
const
auto
num_take
=
index
.
dims
()[
1
];
const
auto
array_dims
=
array
.
dims
();
const
auto
idx_dims
=
index
.
dims
();
// UNDERSTAND: no allocations here
const
T
*
p_array
=
array
.
data
<
T
>
();
const
int64_t
*
p_index
=
index
.
data
<
int64_t
>
();
T
*
p_value
=
value
->
data
<
T
>
();
// src slice size
const
auto
array_slice_size
=
array_dims
[
1
];
// index slice size
const
auto
idx_slice_size
=
idx_dims
[
1
];
const
auto
value_slice_size
=
idx_slice_size
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_take
;
++
j
)
{
auto
array_index
=
p_index
[
i
*
idx_slice_size
+
j
];
p_value
[
i
*
value_slice_size
+
j
]
=
p_array
[
i
*
array_slice_size
+
array_index
];
}
}
}
// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate
// indices, scatter is done in += way.
template
<
typename
T
>
static
void
CPUPutAlongD1
(
const
platform
::
DeviceContext
&
ctx
,
framework
::
Tensor
*
array
,
const
framework
::
Tensor
&
index
,
const
framework
::
Tensor
&
value
)
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()));
// UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
PADDLE_ENFORCE
(
index
.
dims
().
size
()
==
2
&&
array
->
dims
().
size
()
==
2
&&
index
.
dims
()[
0
]
==
array
->
dims
()[
0
]
&&
index
.
dims
()
==
value
.
dims
());
const
auto
batch_size
=
index
.
dims
()[
0
];
const
auto
num_put
=
index
.
dims
()[
1
];
auto
array_dims
=
array
->
dims
();
auto
idx_dims
=
index
.
dims
();
// UNDERSTAND: no allocations here
T
*
p_array
=
array
->
data
<
T
>
();
const
int64_t
*
p_index
=
index
.
data
<
int64_t
>
();
const
T
*
p_value
=
value
.
data
<
T
>
();
// slice sizes
const
auto
array_slice_size
=
array_dims
[
1
];
const
auto
idx_slice_size
=
idx_dims
[
1
];
const
auto
value_slice_size
=
idx_slice_size
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_put
;
++
j
)
{
auto
array_index
=
p_index
[
i
*
idx_slice_size
+
j
];
p_array
[
i
*
array_slice_size
+
array_index
]
+=
p_value
[
i
*
value_slice_size
+
j
];
}
}
}
// UNDERSTAND: compute accidentdal hits from samples and minus corresponding
// logits by a float max, here 1e20
template
<
typename
T
>
static
void
compute_remove_accidental_hits
(
const
platform
::
DeviceContext
&
ctx
,
framework
::
Tensor
*
sampled_logits
,
const
framework
::
Tensor
&
samples
,
const
int
num_true
)
{
const
auto
batch_size
=
sampled_logits
->
dims
()[
0
];
const
auto
num_sampled_classes
=
sampled_logits
->
dims
()[
1
];
T
*
sampled_logits_data
=
sampled_logits
->
data
<
T
>
();
const
auto
samples_data
=
samples
.
data
<
int64_t
>
();
std
::
unordered_set
<
int64_t
>
tmp_true_labels
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
tmp_true_labels
.
clear
();
tmp_true_labels
.
insert
(
samples_data
+
i
*
num_sampled_classes
,
samples_data
+
i
*
num_sampled_classes
+
num_true
);
for
(
int
j
=
num_true
;
j
<
num_sampled_classes
;
++
j
)
{
const
auto
idx
=
i
*
num_sampled_classes
+
j
;
if
(
tmp_true_labels
.
find
(
samples_data
[
idx
])
!=
tmp_true_labels
.
end
())
sampled_logits_data
[
idx
]
-=
1e20
;
}
}
}
template
<
typename
T
>
class
SampleLogitsKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
using
Tensor
=
framework
::
Tensor
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
context
.
GetPlace
()),
"This kernel only runs on CPU."
);
VLOG
(
3
)
<<
"Enter SampleLogitsKernel"
;
// get necessary inputs
const
Tensor
*
logits
=
context
.
Input
<
Tensor
>
(
"Logits"
);
const
Tensor
*
labels
=
context
.
Input
<
Tensor
>
(
"Labels"
);
// get necessary outputs
Tensor
*
samples
=
context
.
Output
<
Tensor
>
(
"Samples"
);
Tensor
*
probabilities
=
context
.
Output
<
Tensor
>
(
"Probabilities"
);
Tensor
*
sampled_logits
=
context
.
Output
<
Tensor
>
(
"SampledLogits"
);
Tensor
*
sampled_labels
=
context
.
Output
<
Tensor
>
(
"SampledLabels"
);
// shapes
const
auto
batch_size
=
logits
->
dims
()[
0
];
const
auto
num_classes
=
logits
->
dims
()[
1
];
const
auto
labels_dim
=
labels
->
dims
();
const
auto
num_true
=
labels_dim
[
1
];
const
auto
samples_dim
=
samples
->
dims
();
// attrs
const
auto
num_samples
=
context
.
Attr
<
int
>
(
"num_samples"
);
const
bool
use_customized_samples
=
context
.
Attr
<
bool
>
(
"use_customized_samples"
);
const
bool
remove_accidental_hits
=
context
.
Attr
<
bool
>
(
"remove_accidental_hits"
);
// device contexts
auto
&
dev_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
// UNDERSTAND: allocate memories for temporaries
sampled_logits
->
mutable_data
<
T
>
(
samples_dim
,
context
.
GetPlace
());
auto
sampled_labels_data
=
sampled_labels
->
mutable_data
<
int64_t
>
(
labels_dim
,
context
.
GetPlace
());
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_true
;
++
j
)
{
sampled_labels_data
[
i
*
num_true
+
j
]
=
j
;
}
}
if
(
use_customized_samples
)
{
const
Tensor
*
customized_samples
=
context
.
Input
<
Tensor
>
(
"CustomizedSamples"
);
const
Tensor
*
customized_probabilities
=
context
.
Input
<
Tensor
>
(
"CustomizedProbabilities"
);
samples
->
ShareDataWith
(
*
customized_samples
);
probabilities
->
ShareDataWith
(
*
customized_probabilities
);
}
else
{
samples
->
mutable_data
<
int64_t
>
(
context
.
GetPlace
());
probabilities
->
mutable_data
<
T
>
(
samples_dim
,
context
.
GetPlace
());
// UNDERSTAND: sampling
const
auto
seed
=
context
.
Attr
<
int
>
(
"seed"
);
auto
sampler_with_prob
=
math
::
SampleWithProb
<
platform
::
CPUDeviceContext
,
T
>
();
sampler_with_prob
(
dev_ctx
,
math
::
LogUniformSampler
(
num_classes
,
seed
),
num_samples
,
labels
,
samples
,
probabilities
);
}
// UNDERSTAND: gather sampled logits and remove accidental hits if needed
CPUTakeAlongD1
<
T
>
(
dev_ctx
,
*
logits
,
*
samples
,
sampled_logits
);
if
(
remove_accidental_hits
)
{
compute_remove_accidental_hits
<
T
>
(
dev_ctx
,
sampled_logits
,
*
samples
,
num_true
);
}
// subtracted sampled logits with logQ(y|x)
auto
probs
=
EigenMatrix
<
T
>::
From
(
*
probabilities
);
auto
smp_logits
=
EigenMatrix
<
T
>::
From
(
*
sampled_logits
);
smp_logits
.
device
(
*
dev_ctx
.
eigen_device
())
=
(
smp_logits
-
probs
.
log
().
unaryExpr
(
TolerableValue
<
T
>
()))
.
unaryExpr
(
TolerableValue
<
T
>
());
}
};
template
<
typename
T
>
class
SampleLogitsGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
using
Tensor
=
framework
::
Tensor
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
logits_grad
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Logits"
));
const
Tensor
*
samples
=
context
.
Input
<
Tensor
>
(
"Samples"
);
const
Tensor
*
sampled_logits_grad
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"SampledLogits"
));
logits_grad
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
&
dev_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
logits_grad
,
static_cast
<
T
>
(
0
));
// UNDERSTAND: scatter it back to logit_grad
CPUPutAlongD1
<
T
>
(
dev_ctx
,
logits_grad
,
*
samples
,
*
sampled_logits_grad
);
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
浏览文件 @
c4187dbd
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
...
...
@@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
}
}
template
<
typename
T
>
static
int
ExpandByMemoryCopy
(
const
platform
::
CUDADeviceContext
&
context
,
const
LoDTensor
&
x
,
LoDTensor
*
out
,
const
framework
::
Vector
<
size_t
>&
x_lod
,
const
framework
::
Vector
<
size_t
>&
ref_lod
,
bool
do_copy
)
{
auto
out_data
=
out
->
data
<
T
>
();
auto
x_data
=
x
.
data
<
T
>
();
auto
&
gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
context
.
GetPlace
());
int
x_item_length
=
x
.
numel
()
/
x
.
dims
()[
0
];
int
out_offset
=
0
;
int
num_copys
=
0
;
for
(
size_t
i
=
1
;
i
<
ref_lod
.
size
();
++
i
)
{
int
repeat_num
=
ref_lod
[
i
]
-
ref_lod
[
i
-
1
];
int
x_start
=
x_lod
[
i
-
1
];
int
x_end
=
x_lod
[
i
];
int
x_seq_len
=
x_end
-
x_start
;
if
(
repeat_num
>
0
)
{
if
(
do_copy
)
{
int
out_start
=
out_offset
;
if
(
out
->
lod
().
size
()
==
1
)
{
out_start
=
out
->
lod
()[
0
][
out_offset
];
}
for
(
int
j
=
0
;
j
<
repeat_num
;
j
++
)
{
for
(
int
k
=
0
;
k
<
x_seq_len
;
k
++
)
{
memory
::
Copy
(
gpu_place
,
out_data
+
(
out_start
+
j
*
x_seq_len
+
k
)
*
x_item_length
,
gpu_place
,
x_data
+
(
x_start
+
k
)
*
x_item_length
,
sizeof
(
T
)
*
x_item_length
,
context
.
stream
());
}
}
}
else
{
num_copys
+=
repeat_num
*
x_seq_len
;
}
}
out_offset
+=
repeat_num
;
}
return
num_copys
;
}
template
<
typename
T
>
struct
SequenceExpandFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
void
operator
()(
...
...
@@ -95,11 +139,30 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
const
framework
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
framework
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
LoDTensor
*
out
)
{
int
num_copys
=
ExpandByMemoryCopy
<
T
>
(
context
,
x
,
out
,
x_lod
,
ref_lod
,
false
);
// Sometimes direct copies will be faster, this maybe need deeply analysis.
if
(
num_copys
<
5
)
{
ExpandByMemoryCopy
<
T
>
(
context
,
x
,
out
,
x_lod
,
ref_lod
,
true
);
}
else
{
int
x_item_length
=
x
.
numel
()
/
x
.
dims
()[
0
];
framework
::
Vector
<
size_t
>
out_offset
(
x_lod
.
size
());
size_t
x_lod_size
=
x_lod
.
size
();
framework
::
Vector
<
size_t
>
out_offset
(
x_lod_size
*
2
+
ref_lod
.
size
());
GetOutputOffset
(
x_lod
,
ref_lod
,
&
out_offset
);
int
thread_x
=
std
::
min
(
32
,
std
::
max
(
static_cast
<
int
>
(
ref_lod
.
size
()),
16
));
for
(
size_t
i
=
0
;
i
<
x_lod_size
;
++
i
)
{
out_offset
[
x_lod_size
+
i
]
=
x_lod
[
i
];
}
for
(
size_t
i
=
0
;
i
<
ref_lod
.
size
();
++
i
)
{
out_offset
[
2
*
x_lod_size
+
i
]
=
ref_lod
[
i
];
}
const
size_t
*
out_offset_data
=
out_offset
.
CUDAData
(
context
.
GetPlace
());
const
size_t
*
x_lod_data
=
out_offset_data
+
x_lod_size
;
const
size_t
*
ref_lod_data
=
out_offset_data
+
2
*
x_lod_size
;
int
thread_x
=
std
::
min
(
32
,
std
::
max
(
static_cast
<
int
>
(
ref_lod
.
size
()),
16
));
int
thread_y
=
16
;
int
thread_z
=
1024
/
thread_x
/
thread_y
;
int
block_x
=
static_cast
<
int
>
(
ref_lod
.
size
());
...
...
@@ -107,10 +170,9 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
dim3
grid_size
(
block_x
,
1
);
sequence_expand_kernel
<<<
grid_size
,
block_size
,
0
,
context
.
stream
()
>>>
(
x
.
data
<
T
>
(),
x_lod
.
CUDAData
(
context
.
GetPlace
()),
ref_lod
.
CUDAData
(
context
.
GetPlace
()),
out_offset
.
CUDAData
(
context
.
GetPlace
()),
x_lod
.
size
(),
x_item_length
,
out
->
mutable_data
<
T
>
(
context
.
GetPlace
()));
x
.
data
<
T
>
(),
x_lod_data
,
ref_lod_data
,
out_offset_data
,
x_lod_size
,
x_item_length
,
out
->
mutable_data
<
T
>
(
context
.
GetPlace
()));
}
}
};
...
...
paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
浏览文件 @
c4187dbd
...
...
@@ -117,11 +117,11 @@ class TeacherStudentSigmoidLossOpMaker
"[N x 1]. The teacher student sigmoid loss."
);
AddAttr
<
float
>
(
"soft_max_up_bound"
,
"fp32, if input > soft_max_up_bound, will be bound, default 15.0"
)
"fp32, if input > soft_max_up_bound,
input
will be bound, default 15.0"
)
.
SetDefault
(
15.0
);
AddAttr
<
float
>
(
"soft_max_lower_bound"
,
"fp32, if input < soft_max_lower_bound, will be
bound, default -15.0"
)
AddAttr
<
float
>
(
"soft_max_lower_bound"
,
"fp32, if input < soft_max_lower_bound, input will be "
"
bound, default -15.0"
)
.
SetDefault
(
-
15.0
);
AddComment
(
R"DOC(
TeacherStudentSigmoidLoss Operator.
...
...
paddle/fluid/platform/CMakeLists.txt
浏览文件 @
c4187dbd
...
...
@@ -87,11 +87,11 @@ nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library
(
timer SRCS timer.cc
)
cc_test
(
timer_test SRCS timer_test.cc DEPS timer
)
cc_library
(
device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto
device_context
${
GPU_CTX_DEPS
}
)
cc_library
(
device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto
${
GPU_CTX_DEPS
}
)
if
(
WITH_GPU
)
nv_library
(
profiler SRCS profiler.cc profiler.cu DEPS device_
context device_tracer
)
nv_library
(
profiler SRCS profiler.cc profiler.cu DEPS device_
tracer gpu_info enforce
)
else
()
cc_library
(
profiler SRCS profiler.cc DEPS device_
context device_tracer
)
cc_library
(
profiler SRCS profiler.cc DEPS device_
tracer enforce
)
endif
()
cc_test
(
profiler_test SRCS profiler_test.cc DEPS profiler
)
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
c4187dbd
...
...
@@ -394,7 +394,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
int
tid
=
platform
::
get_cur_thread_id
();
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
p_mutex_
.
get
()
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
p_mutex_
);
// Find KeyBlob for current thread
auto
map_it
=
pMap
->
find
(
tid
);
...
...
@@ -427,7 +427,7 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
int
tid
=
platform
::
get_cur_thread_id
();
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
p_mutex_
.
get
()
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
p_mutex_
);
// Find KeyBlob for current thread firstly
auto
map_it
=
pMap
->
find
(
tid
);
...
...
paddle/fluid/platform/device_tracer.cc
浏览文件 @
c4187dbd
...
...
@@ -136,7 +136,7 @@ void EnableActivity() {
CUPTI_CALL
(
dynload
::
cuptiActivityEnable
(
CUPTI_ACTIVITY_KIND_DRIVER
));
CUPTI_CALL
(
dynload
::
cuptiActivityEnable
(
CUPTI_ACTIVITY_KIND_RUNTIME
));
// We don't track these activities for now.
//
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL
(
dynload
::
cuptiActivityEnable
(
CUPTI_ACTIVITY_KIND_MEMSET
));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
...
...
@@ -155,7 +155,7 @@ void DisableActivity() {
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
CUPTI_CALL
(
dynload
::
cuptiActivityDisable
(
CUPTI_ACTIVITY_KIND_DRIVER
));
CUPTI_CALL
(
dynload
::
cuptiActivityDisable
(
CUPTI_ACTIVITY_KIND_RUNTIME
));
//
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL
(
dynload
::
cuptiActivityDisable
(
CUPTI_ACTIVITY_KIND_MEMSET
));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
...
...
@@ -212,6 +212,14 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
memcpy
->
correlationId
,
memcpy
->
bytes
);
break
;
}
case
CUPTI_ACTIVITY_KIND_MEMSET
:
{
auto
*
memset
=
reinterpret_cast
<
const
CUpti_ActivityMemset
*>
(
record
);
tracer
->
AddKernelRecords
(
"MEMSET"
,
memset
->
start
,
memset
->
end
,
memset
->
deviceId
,
memset
->
streamId
,
memset
->
correlationId
);
break
;
}
case
CUPTI_ACTIVITY_KIND_DRIVER
:
{
auto
*
api
=
reinterpret_cast
<
const
CUpti_ActivityAPI
*>
(
record
);
if
(
api
->
start
!=
0
&&
api
->
end
!=
0
)
...
...
@@ -348,6 +356,8 @@ class DeviceTracerImpl : public DeviceTracer {
const
std
::
vector
<
int
>
cbids
{
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
#if CUDA_VERSION >= 9000
...
...
@@ -601,6 +611,8 @@ void initCuptiCbidStr() {
REGISTER_RUNTIME_CBID_STR
(
cudaStreamSynchronize_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaStreamWaitEvent_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaUnbindTexture_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaSetupArgument_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaLaunch_v3020
);
#if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR
(
cudaLaunchCooperativeKernel_v9000
);
REGISTER_RUNTIME_CBID_STR
(
cudaLaunchCooperativeKernelMultiDevice_v9000
);
...
...
paddle/fluid/platform/device_tracer.h
浏览文件 @
c4187dbd
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h"
...
...
@@ -32,8 +33,6 @@ inline uint64_t PosixInNsec() {
return
1000
*
(
static_cast
<
uint64_t
>
(
tv
.
tv_sec
)
*
1000000
+
tv
.
tv_usec
);
}
class
Event
;
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
...
...
paddle/fluid/platform/enforce.h
浏览文件 @
c4187dbd
...
...
@@ -34,6 +34,7 @@ limitations under the License. */
#include <type_traits>
#include <utility>
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
...
...
paddle/fluid/platform/event.h
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
namespace
paddle
{
namespace
platform
{
enum
EventType
{
kMark
,
kPushRange
,
kPopRange
};
class
Event
{
public:
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event
(
EventType
type
,
std
::
string
name
,
uint32_t
thread_id
);
const
EventType
&
type
()
const
;
std
::
string
name
()
const
{
return
name_
;
}
uint32_t
thread_id
()
const
{
return
thread_id_
;
}
#ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
cudaEvent_t
event
()
const
{
return
event_
;
}
int
device
()
const
{
return
device_
;
}
#endif
#endif
double
CpuElapsedMs
(
const
Event
&
e
)
const
;
double
CudaElapsedMs
(
const
Event
&
e
)
const
;
private:
EventType
type_
;
std
::
string
name_
;
uint32_t
thread_id_
;
int64_t
cpu_ns_
;
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
int64_t
gpu_ns_
=
0
;
public:
void
AddCudaElapsedTime
(
int64_t
start_ns
,
int64_t
end_ns
)
{
gpu_ns_
+=
end_ns
-
start_ns
;
}
private:
#else
cudaEvent_t
event_
=
nullptr
;
int
device_
=
-
1
;
#endif
#endif
};
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/mkldnn_reuse.h
浏览文件 @
c4187dbd
...
...
@@ -39,6 +39,45 @@ class MKLDNNHandler {
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_src_mem_p"
);
}
// TODO(jczaja): extract common part and make AcquireMemory
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemory
(
const
mkldnn
::
memory
::
primitive_desc
&
mpd
,
void
*
ptr
)
{
auto
local_key
=
key_
+
"@user_src_mem_p"
;
auto
mem_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
mem_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
" find mem primitive in device context"
);
if
(
mem_p
==
nullptr
)
{
mem_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mpd
,
ptr
);
dev_ctx_
.
SetBlob
(
local_key
,
mem_p
);
}
else
{
mem_p
->
set_data_handle
(
ptr
);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_
=
true
;
}
return
mem_p
;
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemory
(
const
mkldnn
::
memory
::
primitive_desc
&
mpd
,
void
*
ptr
)
{
auto
local_key
=
key_
+
"@user_weights_mem_p"
;
auto
mem_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
mem_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
" find mem primitive in device context"
);
if
(
mem_p
==
nullptr
)
{
mem_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mpd
,
ptr
);
dev_ctx_
.
SetBlob
(
local_key
,
mem_p
);
}
else
{
mem_p
->
set_data_handle
(
ptr
);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_
=
true
;
}
return
mem_p
;
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
,
user_function
custom_func
=
{})
{
...
...
@@ -232,7 +271,6 @@ class MKLDNNHandler {
AppendKey
(
key
,
suffix
);
}
protected:
static
void
AppendKeyDims
(
std
::
string
*
key
,
const
mkldnn
::
memory
::
dims
&
dims
)
{
for
(
unsigned
int
i
=
0
;
i
<
dims
.
size
();
i
++
)
{
...
...
@@ -250,6 +288,7 @@ class MKLDNNHandler {
key
->
append
(
s
);
}
protected:
static
std
::
string
dims2str
(
const
mkldnn
::
memory
::
dims
&
operand_dims
)
{
std
::
string
dstr
=
""
;
for
(
size_t
i
=
0
;
i
<
operand_dims
.
size
();
++
i
)
{
...
...
@@ -263,6 +302,9 @@ class MKLDNNHandler {
mkldnn
::
engine
engine_
;
std
::
string
key_
;
bool
is_reusing_
;
public:
static
constexpr
int
MaxKeyLength
=
256
;
};
class
TransposeMKLDNNHandler
:
public
MKLDNNHandler
{
...
...
@@ -273,37 +315,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
mkldnn
::
engine
engine
,
const
std
::
string
&
base_key
)
:
platform
::
MKLDNNHandler
(
dev_ctx
,
engine
,
base_key
),
dims_
(
dims
),
axis_
(
axis
),
logical_axis_
(
dims
.
size
(),
0
)
{}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemory
(
const
mkldnn
::
memory
::
format
&
fmt
,
void
*
ptr
)
{
auto
local_key
=
key_
+
"@user_src_mem_p"
;
auto
mem_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
mem_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
" find mem primitive in device context"
);
if
(
mem_p
==
nullptr
)
{
// Make memory descriptor using input format, unless it
// cannot be trusted (nchw) then make up memory fmt manually
for
(
size_t
i
=
0
;
i
<
logical_axis_
.
size
();
++
i
)
{
logical_axis_
[
i
]
=
i
;
}
auto
src_md
=
fmt
!=
mkldnn
::
memory
::
format
::
nchw
?
platform
::
MKLDNNMemDesc
(
dims_
,
platform
::
MKLDNNGetDataType
<
float
>
(),
fmt
)
:
Axis2MemoryDesc
(
dims_
,
logical_axis_
);
mem_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mkldnn
::
memory
::
primitive_desc
{
src_md
,
engine_
},
ptr
);
dev_ctx_
.
SetBlob
(
local_key
,
mem_p
);
}
else
{
mem_p
->
set_data_handle
(
ptr
);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_
=
true
;
}
return
mem_p
;
}
axis_
(
axis
)
{}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDstMemory
(
framework
::
Tensor
*
output
,
platform
::
Place
place
)
{
...
...
@@ -388,7 +400,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
private:
std
::
vector
<
int
>
dims_
;
std
::
vector
<
int
>
axis_
;
std
::
vector
<
int
>
logical_axis_
;
};
template
<
class
forward_t
,
class
backward_data_t
,
class
backward_weights_t
>
...
...
@@ -548,9 +559,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
PADDLE_ENFORCE
((
conv_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution primitive in device context"
);
if
(
conv_p
==
nullptr
)
{
conv_p
=
std
::
make_shared
<
forward_t
>
(
*
conv_pd_
,
*
(
src_memory_p
),
*
(
weights_memory_p
.
get
()),
*
(
dst_memory_p
.
get
()));
conv_p
=
std
::
make_shared
<
forward_t
>
(
*
conv_pd_
,
*
src_memory_p
,
*
weights_memory_p
,
*
dst_memory_p
);
dev_ctx_
.
SetBlob
(
prim_key
,
conv_p
);
}
else
{
...
...
@@ -570,9 +580,9 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
PADDLE_ENFORCE
((
conv_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution primitive in device context"
);
if
(
conv_p
==
nullptr
)
{
conv_p
=
std
::
make_shared
<
forward_t
>
(
*
conv_pd_
,
*
(
src_memory_p
),
*
(
weights_memory_p
.
get
())
,
*
(
bias_memory_p
.
get
()),
*
(
dst_memory_p
.
get
())
);
conv_p
=
std
::
make_shared
<
forward_t
>
(
*
conv_pd_
,
*
src_memory_p
,
*
weights_memory_p
,
*
bias_memory_p
,
*
dst_memory_p
);
dev_ctx_
.
SetBlob
(
prim_key
,
conv_p
);
}
else
{
...
...
paddle/fluid/platform/mkldnn_utils.h
0 → 100644
浏览文件 @
c4187dbd
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkldnn.h>
#include <string>
namespace
paddle
{
namespace
platform
{
inline
mkldnn
::
memory
::
primitive_desc
create_prim_desc_from_dims
(
const
std
::
vector
<
int
>&
ltz
,
mkldnn
::
memory
::
format
fmt
,
mkldnn
::
memory
::
data_type
data_type
=
mkldnn
::
memory
::
data_type
::
f32
)
{
mkldnn_memory_desc_t
mem_fmt
;
mem_fmt
.
primitive_kind
=
mkldnn_memory
;
mem_fmt
.
ndims
=
ltz
.
size
();
for
(
unsigned
int
i
=
0
;
i
<
ltz
.
size
();
++
i
)
{
mem_fmt
.
dims
[
i
]
=
ltz
[
i
];
// logical dimensions (nchw format,
// regardless physical layout)
}
mem_fmt
.
data_type
=
static_cast
<
mkldnn_data_type_t
>
(
data_type
);
mem_fmt
.
format
=
static_cast
<
mkldnn_memory_format_t
>
(
fmt
);
unsigned
int
total_stride
=
1
;
for
(
int
i
=
ltz
.
size
()
-
1
;
i
>=
0
;
--
i
)
{
mem_fmt
.
layout_desc
.
blocking
.
padding_dims
[
i
]
=
ltz
[
i
];
// logical dimensions (nchw format, regardless physical
// layout)
mem_fmt
.
layout_desc
.
blocking
.
block_dims
[
i
]
=
1
;
mem_fmt
.
layout_desc
.
blocking
.
offset_padding_to_data
[
i
]
=
0
;
// no offset
mem_fmt
.
layout_desc
.
blocking
.
strides
[
0
][
i
]
=
total_stride
;
mem_fmt
.
layout_desc
.
blocking
.
strides
[
1
][
i
]
=
1
;
total_stride
*=
ltz
[
i
];
}
mem_fmt
.
layout_desc
.
blocking
.
offset_padding
=
0
;
// no initial offset
auto
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
place
=
paddle
::
platform
::
CPUPlace
();
auto
*
dev_ctx
=
dynamic_cast
<
platform
::
MKLDNNDeviceContext
*>
(
pool
.
Get
(
place
));
auto
&
cpu_engine
=
dev_ctx
->
GetEngine
();
return
mkldnn
::
memory
::
primitive_desc
(
mem_fmt
,
cpu_engine
);
}
inline
mkldnn
::
memory
::
primitive_desc
create_prim_desc_from_format
(
const
std
::
vector
<
int
>&
ltz
,
const
mkldnn
::
memory
::
format
format
,
const
mkldnn
::
memory
::
data_type
data_type
)
{
auto
md
=
mkldnn
::
memory
::
desc
({
ltz
},
data_type
,
format
);
auto
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
place
=
paddle
::
platform
::
CPUPlace
();
auto
dev_ctx
=
dynamic_cast
<
platform
::
MKLDNNDeviceContext
*>
(
pool
.
Get
(
place
));
PADDLE_ENFORCE_NOT_NULL
(
dev_ctx
,
"Could not get valid device"
);
auto
&
cpu_engine
=
dev_ctx
->
GetEngine
();
return
mkldnn
::
memory
::
primitive_desc
(
md
,
cpu_engine
);
}
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/profiler.cc
浏览文件 @
c4187dbd
...
...
@@ -254,9 +254,11 @@ struct EventItem {
std
::
string
name
;
int
calls
;
double
total_time
;
double
min_time
;
double
max_time
;
double
ave_time
;
double
min_time
;
double
cpu_time
;
double
gpu_time
;
float
ratio
;
};
...
...
@@ -290,8 +292,12 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
// Output events table
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
<<
std
::
setw
(
name_width
)
<<
"Event"
<<
std
::
setw
(
data_width
)
<<
"Calls"
<<
std
::
setw
(
data_width
)
<<
"Total"
<<
std
::
setw
(
data_width
)
<<
"Min."
<<
std
::
setw
(
data_width
)
<<
"Calls"
<<
std
::
setw
(
data_width
)
<<
"Total"
;
if
(
g_state
==
ProfilerState
::
kAll
)
{
std
::
cout
<<
std
::
setw
(
data_width
*
2
)
<<
"CPU Time (Ratio)"
<<
std
::
setw
(
data_width
*
2
)
<<
"GPU Time (Ratio)"
;
}
std
::
cout
<<
std
::
setw
(
data_width
)
<<
"Min."
<<
std
::
setw
(
data_width
)
<<
"Max."
<<
std
::
setw
(
data_width
)
<<
"Ave."
<<
std
::
setw
(
data_width
)
<<
"Ratio."
<<
std
::
endl
;
for
(
size_t
i
=
0
;
i
<
events_table
.
size
();
++
i
)
{
...
...
@@ -299,8 +305,18 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
const
EventItem
&
event_item
=
events_table
[
i
][
j
];
std
::
cout
<<
std
::
setw
(
name_width
)
<<
event_item
.
name
<<
std
::
setw
(
data_width
)
<<
event_item
.
calls
<<
std
::
setw
(
data_width
)
<<
event_item
.
total_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
min_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
total_time
;
if
(
g_state
==
ProfilerState
::
kAll
)
{
std
::
cout
<<
std
::
setw
(
data_width
*
2
)
<<
string
::
Sprintf
(
"%f (%f)"
,
event_item
.
cpu_time
,
(
event_item
.
cpu_time
/
event_item
.
total_time
))
<<
std
::
setw
(
data_width
*
2
)
<<
string
::
Sprintf
(
"%f (%f)"
,
event_item
.
gpu_time
,
(
event_item
.
gpu_time
/
event_item
.
total_time
));
}
std
::
cout
<<
std
::
setw
(
data_width
)
<<
event_item
.
min_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
max_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
ave_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
ratio
<<
std
::
endl
;
...
...
@@ -349,6 +365,18 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
return
a
.
ave_time
>
b
.
ave_time
;
};
break
;
case
EventSortingKey
::
kGPUTime
:
sorted_domain
=
"average time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
gpu_time
>
b
.
gpu_time
;
};
break
;
case
EventSortingKey
::
kCPUTime
:
sorted_domain
=
"average time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
cpu_time
>
b
.
cpu_time
;
};
break
;
default:
sorted_domain
=
"event first end time"
;
}
...
...
@@ -387,10 +415,17 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
}
if
(
rit
!=
pushed_events
.
rend
())
{
double
event_time
=
(
g_state
==
ProfilerState
::
kCUDA
||
g_state
==
ProfilerState
::
kAll
)
?
rit
->
CudaElapsedMs
((
*
analyze_events
)[
i
][
j
])
:
rit
->
CpuElapsedMs
((
*
analyze_events
)[
i
][
j
]);
double
event_time
=
0
;
double
gpu_time
=
rit
->
CudaElapsedMs
((
*
analyze_events
)[
i
][
j
]);
double
cpu_time
=
rit
->
CpuElapsedMs
((
*
analyze_events
)[
i
][
j
]);
if
(
g_state
==
ProfilerState
::
kCUDA
)
{
event_time
=
gpu_time
;
}
else
if
(
g_state
==
ProfilerState
::
kCPU
)
{
event_time
=
cpu_time
;
}
else
{
event_time
=
gpu_time
+
cpu_time
;
}
total
+=
event_time
;
std
::
string
event_name
;
...
...
@@ -407,7 +442,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
event_idx
[
event_name
]
=
event_items
.
size
();
EventItem
event_item
=
{
event_name
,
1
,
event_time
,
event_time
,
event_time
,
event_time
,
0.
};
gpu_time
,
cpu_time
,
0.
};
event_items
.
push_back
(
event_item
);
}
else
{
int
index
=
event_idx
[
event_name
];
...
...
@@ -420,6 +455,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
// max time
event_items
[
index
].
max_time
=
std
::
max
(
event_time
,
event_items
[
index
].
max_time
);
event_items
[
index
].
gpu_time
+=
gpu_time
;
event_items
[
index
].
cpu_time
+=
cpu_time
;
}
// remove the push marker from the list
...
...
paddle/fluid/platform/profiler.cu
浏览文件 @
c4187dbd
...
...
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <cuda.h>
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
platform
{
...
...
@@ -22,26 +21,27 @@ namespace platform {
__global__
void
DummyKernel
(
int
*
a
)
{
a
[
0
]
=
0
;
}
static
void
ForEachDevice
(
std
::
function
<
void
(
int
)
>
func
)
{
auto
original_device
=
GetCurrentDeviceId
();
int
count
=
GetCUDADeviceCount
();
auto
original_device
=
platform
::
GetCurrentDeviceId
();
int
count
=
platform
::
GetCUDADeviceCount
();
for
(
int
i
=
0
;
i
<
count
;
i
++
)
{
SetDeviceId
(
i
);
platform
::
SetDeviceId
(
i
);
func
(
i
);
}
SetDeviceId
(
original_device
);
platform
::
SetDeviceId
(
original_device
);
}
void
DummyKernelAndEvent
()
{
for
(
int
i
=
0
;
i
<
5
;
i
++
)
{
ForEachDevice
([](
int
d
)
{
CUDADeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
CUDAPlace
(
d
));
platform
::
SetDeviceId
(
d
);
cudaStream_t
stream
;
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream
));
Mark
(
"_cuda_startup_"
);
int
*
ptr
;
PADDLE_ENFORCE
(
cudaMalloc
(
&
ptr
,
sizeof
(
int
)));
DummyKernel
<<<
1
,
1
,
0
,
dev_ctx
->
stream
()
>>>
(
ptr
);
dev_ctx
->
Wait
(
);
DummyKernel
<<<
1
,
1
,
0
,
stream
>>>
(
ptr
);
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
)
);
PADDLE_ENFORCE
(
cudaFree
(
ptr
));
delete
dev_ctx
;
});
}
}
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
c4187dbd
...
...
@@ -17,54 +17,13 @@ limitations under the License. */
#include <list>
#include <string>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
namespace
paddle
{
namespace
platform
{
enum
EventType
{
kMark
,
kPushRange
,
kPopRange
};
class
Event
{
public:
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event
(
EventType
type
,
std
::
string
name
,
uint32_t
thread_id
);
const
EventType
&
type
()
const
;
std
::
string
name
()
const
{
return
name_
;
}
uint32_t
thread_id
()
const
{
return
thread_id_
;
}
#ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
cudaEvent_t
event
()
const
{
return
event_
;
}
int
device
()
const
{
return
device_
;
}
#endif
#endif
double
CpuElapsedMs
(
const
Event
&
e
)
const
;
double
CudaElapsedMs
(
const
Event
&
e
)
const
;
private:
EventType
type_
;
std
::
string
name_
;
uint32_t
thread_id_
;
int64_t
cpu_ns_
;
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h"
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
int64_t
gpu_ns_
=
0
;
public:
void
AddCudaElapsedTime
(
int64_t
start_ns
,
int64_t
end_ns
)
{
gpu_ns_
+=
end_ns
-
start_ns
;
}
private:
#else
cudaEvent_t
event_
=
nullptr
;
int
device_
=
-
1
;
#endif
#include "paddle/fluid/platform/gpu_info.h"
#endif
};
namespace
paddle
{
namespace
platform
{
enum
ProfilerState
{
kDisabled
,
// disabled state
...
...
@@ -117,7 +76,16 @@ struct RecordBlock {
std
::
vector
<
std
::
vector
<
Event
>>
GetAllEvents
();
// Candidate keys to sort the profiling report
enum
EventSortingKey
{
kDefault
,
kCalls
,
kTotal
,
kMin
,
kMax
,
kAve
};
enum
EventSortingKey
{
kDefault
,
kCalls
,
kTotal
,
kMin
,
kMax
,
kAve
,
kCPUTime
,
kGPUTime
};
// Enable the profiling function.
void
EnableProfiler
(
ProfilerState
state
);
...
...
paddle/fluid/platform/profiler_test.cc
浏览文件 @
c4187dbd
...
...
@@ -33,7 +33,6 @@ TEST(Event, CpuElapsedTime) {
}
TEST
(
RecordEvent
,
RecordEvent
)
{
using
paddle
::
platform
::
DeviceContext
;
using
paddle
::
platform
::
Event
;
using
paddle
::
platform
::
EventType
;
using
paddle
::
platform
::
RecordEvent
;
...
...
paddle/fluid/platform/temporary_allocator_test.cc
浏览文件 @
c4187dbd
...
...
@@ -141,7 +141,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CPUDeviceContext
*>
(
pool
.
Get
(
cpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
tensor
=
...
...
@@ -156,7 +156,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
gpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
tensor
=
ctx
.
AllocateTmpTensor
<
float
,
platform
::
CUDADeviceContext
>
(
...
...
@@ -179,7 +179,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CPUDeviceContext
*>
(
pool
.
Get
(
cpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
out_side_tensor
;
...
...
@@ -200,7 +200,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
gpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
size_t
memory_size
=
500
;
int
numel
=
memory_size
/
sizeof
(
float
);
...
...
paddle/fluid/pybind/imperative.cc
浏览文件 @
c4187dbd
...
...
@@ -34,7 +34,7 @@ void BindTracer(pybind11::module* m) {
framework
::
BlockDesc
*
block
,
const
platform
::
CPUPlace
expected_place
,
const
bool
stop_gradient
=
false
)
{
self
.
Trace
(
op
,
inputs
,
outputs
,
block
,
expected_place
,
return
self
.
Trace
(
op
,
inputs
,
outputs
,
block
,
expected_place
,
stop_gradient
);
})
.
def
(
"trace"
,
...
...
@@ -44,7 +44,7 @@ void BindTracer(pybind11::module* m) {
framework
::
BlockDesc
*
block
,
const
platform
::
CUDAPlace
expected_place
,
const
bool
stop_gradient
=
false
)
{
self
.
Trace
(
op
,
inputs
,
outputs
,
block
,
expected_place
,
return
self
.
Trace
(
op
,
inputs
,
outputs
,
block
,
expected_place
,
stop_gradient
);
})
.
def
(
"py_trace"
,
&
imperative
::
Tracer
::
PyTrace
,
...
...
paddle/fluid/pybind/ir.cc
浏览文件 @
c4187dbd
...
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/pybind/ir.h"
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
...
...
@@ -101,7 +102,8 @@ void BindGraph(py::module *m) {
[](
Graph
&
self
,
Node
&
node
)
{
return
self
.
RemoveNode
(
&
node
);
})
.
def
(
"retrieve_node"
,
&
Graph
::
RetrieveNode
,
return_value_policy
::
reference
)
.
def
(
"resolve_hazard"
,
&
Graph
::
ResolveHazard
);
.
def
(
"resolve_hazard"
,
&
Graph
::
ResolveHazard
)
.
def
(
"origin_program_desc"
,
&
Graph
::
OriginProgram
);
}
void
BindNode
(
py
::
module
*
m
)
{
...
...
@@ -115,7 +117,7 @@ void BindNode(py::module *m) {
.
def
(
"is_var"
,
&
Node
::
IsVar
)
.
def
(
"is_ctrl_var"
,
&
Node
::
IsCtrlVar
)
.
def
(
"clear_inputs"
,
[](
Node
&
self
)
{
self
.
inputs
.
clear
();
})
.
def
(
"
inputs_remove
"
,
.
def
(
"
remove_input
"
,
[](
Node
&
self
,
int
node_id
)
{
auto
pos
=
std
::
find_if
(
self
.
inputs
.
begin
(),
self
.
inputs
.
end
(),
...
...
@@ -124,7 +126,7 @@ void BindNode(py::module *m) {
self
.
inputs
.
erase
(
pos
);
}
})
.
def
(
"
inputs_remove
"
,
.
def
(
"
remove_input
"
,
[](
Node
&
self
,
Node
&
node
)
{
auto
pos
=
std
::
find
(
self
.
inputs
.
begin
(),
self
.
inputs
.
end
(),
&
node
);
...
...
@@ -132,10 +134,10 @@ void BindNode(py::module *m) {
self
.
inputs
.
erase
(
pos
);
}
})
.
def
(
"
inputs_append
"
,
.
def
(
"
append_input
"
,
[](
Node
&
self
,
Node
&
node
)
{
self
.
inputs
.
push_back
(
&
node
);
})
.
def
(
"clear_outputs"
,
[](
Node
&
self
)
{
self
.
outputs
.
clear
();
})
.
def
(
"
outputs_remove
"
,
.
def
(
"
remove_output
"
,
[](
Node
&
self
,
int
node_id
)
{
auto
pos
=
std
::
find_if
(
self
.
outputs
.
begin
(),
self
.
outputs
.
end
(),
...
...
@@ -144,7 +146,7 @@ void BindNode(py::module *m) {
self
.
outputs
.
erase
(
pos
);
}
})
.
def
(
"
outputs_remove
"
,
.
def
(
"
remove_output
"
,
[](
Node
&
self
,
Node
&
node
)
{
auto
pos
=
std
::
find
(
self
.
outputs
.
begin
(),
self
.
outputs
.
end
(),
&
node
);
...
...
@@ -152,7 +154,7 @@ void BindNode(py::module *m) {
self
.
outputs
.
erase
(
pos
);
}
})
.
def
(
"
outputs_append
"
,
.
def
(
"
append_output
"
,
[](
Node
&
self
,
Node
&
node
)
{
self
.
outputs
.
push_back
(
&
node
);
})
.
def_readwrite
(
"inputs"
,
&
Node
::
inputs
)
.
def_readwrite
(
"outputs"
,
&
Node
::
outputs
);
...
...
paddle/fluid/pybind/protobuf.cc
浏览文件 @
c4187dbd
...
...
@@ -189,6 +189,8 @@ void BindBlockDesc(pybind11::module *m) {
return
self
.
HasVar
(
name
);
},
pybind11
::
return_value_policy
::
reference
)
.
def
(
"_clear_block"
,
[](
pd
::
BlockDesc
&
self
)
{
return
self
.
Clear
();
},
pybind11
::
return_value_policy
::
reference
)
.
def
(
"_rename_var"
,
[](
pd
::
BlockDesc
&
self
,
const
pybind11
::
bytes
&
byte_name
,
const
pybind11
::
bytes
&
byte_name_new
)
{
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
c4187dbd
...
...
@@ -976,6 +976,7 @@ All parameter, weight, gradient are variables in Paddle.
[](
ir
::
PassBuilder
&
self
,
size_t
idx
)
{
self
.
RemovePass
(
idx
);
});
// -- python binds for parallel executor.
py
::
class_
<
ParallelExecutor
>
pe
(
m
,
"ParallelExecutor"
);
py
::
class_
<
ExecutionStrategy
>
exec_strategy
(
pe
,
"ExecutionStrategy"
,
R"DOC(
ExecutionStrategy allows the user to more preciously control how to run
...
...
@@ -1213,9 +1214,9 @@ All parameter, weight, gradient are variables in Paddle.
cannot be updated after being finalized.)DOC"
);
pe
.
def
(
py
::
init
<
const
std
::
vector
<
platform
::
Place
>
&
,
const
std
::
unordered_set
<
std
::
string
>
&
,
const
ProgramDesc
&
,
const
std
::
string
&
,
Scope
*
,
std
::
vector
<
Scope
*>
&
,
const
ExecutionStrategy
&
,
const
BuildStrategy
&
>
())
const
std
::
unordered_set
<
std
::
string
>
&
,
const
std
::
string
&
,
Scope
*
,
std
::
vector
<
Scope
*>
&
,
const
ExecutionStrategy
&
,
const
BuildStrategy
&
,
ir
::
Graph
*
>
())
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
...
...
paddle/fluid/train/demo/demo_trainer.cc
浏览文件 @
c4187dbd
...
...
@@ -73,7 +73,7 @@ int main() {
PADDLE_ENFORCE_NE
(
loss_name
,
""
,
"loss not found"
);
// init all parameters
executor
.
Run
(
*
startup_program
.
get
()
,
&
scope
,
0
);
executor
.
Run
(
*
startup_program
,
&
scope
,
0
);
// prepare data
auto
x_var
=
scope
.
Var
(
"x"
);
...
...
@@ -101,7 +101,7 @@ int main() {
clock_t
t1
=
clock
();
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
executor
.
Run
(
*
train_program
.
get
()
,
&
scope
,
0
,
false
,
true
);
executor
.
Run
(
*
train_program
,
&
scope
,
0
,
false
,
true
);
std
::
cout
<<
"step: "
<<
i
<<
" loss: "
<<
loss_var
->
Get
<
paddle
::
framework
::
LoDTensor
>
().
data
<
float
>
()[
0
]
<<
std
::
endl
;
...
...
paddle/fluid/train/test_train_recognize_digits.cc
浏览文件 @
c4187dbd
...
...
@@ -74,7 +74,7 @@ void Train() {
float
first_loss
=
0.0
;
float
last_loss
=
0.0
;
for
(
int
i
=
0
;
i
<
100
;
++
i
)
{
executor
.
Run
(
*
train_program
.
get
()
,
&
scope
,
0
,
false
,
true
);
executor
.
Run
(
*
train_program
,
&
scope
,
0
,
false
,
true
);
if
(
i
==
0
)
{
first_loss
=
loss_var
->
Get
<
framework
::
LoDTensor
>
().
data
<
float
>
()[
0
];
}
else
if
(
i
==
99
)
{
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
c4187dbd
...
...
@@ -444,6 +444,7 @@ function assert_api_spec_approvals() {
"paddle/fluid/framework/ir/node.h"
"paddle/fluid/framework/ir/graph.h"
"paddle/fluid/framework/framework.proto"
"python/paddle/fluid/compiler.py"
"paddle/fluid/operators/distributed/send_recv.proto.in"
)
for
API_FILE
in
${
API_FILES
[*]
}
;
do
API_CHANGE
=
`
git diff
--name-only
upstream/
$BRANCH
|
grep
"
${
API_FILE
}
"
||
true
`
...
...
python/paddle/fluid/compiler.py
浏览文件 @
c4187dbd
...
...
@@ -17,8 +17,10 @@ import os
import
six
import
sys
from
..
import
compat
as
cpt
from
.
import
framework
from
.
import
core
from
.
import
framework
__all__
=
[
'CompiledProgram'
,
'ExecutionStrategy'
,
'BuildStrategy'
]
...
...
@@ -36,7 +38,7 @@ def _place_obj(place):
class
CompiledProgram
(
object
):
"""
Compiles
a Program
for execution.
Compiles
to Graph
for execution.
1. Users first create the program with layers.
2. Optionally, users use CompiledProgram to optimize the program before run.
...
...
@@ -51,7 +53,7 @@ class CompiledProgram(object):
Example:
.. code-block:: python
place = fluid.CUDAPlace(0) if use_
cuda
else fluid.CPUPlace()
place = fluid.CUDAPlace(0) if use_
gpu
else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
...
...
@@ -62,11 +64,25 @@ class CompiledProgram(object):
fetch_list=[loss.name])
Args:
program: Program instance that contains the model logic.
program_or_graph (Graph|Program): If it's Program, it will be first
lowered to a graph for further optimizations. If it's a graph
(potentially optimized before), it will be directly used for
further optimizations. Note: graph is only supported when compiled
with with_data_parallel option.
"""
def
__init__
(
self
,
program
):
self
.
_program
=
program
def
__init__
(
self
,
program_or_graph
):
if
isinstance
(
program_or_graph
,
core
.
Graph
):
self
.
_graph
=
program_or_graph
self
.
_program
=
None
elif
isinstance
(
program_or_graph
,
framework
.
Program
):
self
.
_graph
=
core
.
Graph
(
program_or_graph
.
desc
)
self
.
_program
=
program_or_graph
else
:
raise
ValueError
(
"Wrong program_to_graph type: %s"
%
type
(
program_or_graph
))
self
.
_program_desc
=
self
.
_graph
.
origin_program_desc
()
self
.
_scope
=
None
self
.
_place
=
None
self
.
_executor
=
None
...
...
@@ -101,6 +117,7 @@ class CompiledProgram(object):
self
"""
assert
not
self
.
_is_data_parallel
,
"Already compiled with parallel."
assert
not
self
.
_is_inference
,
"Cannot compile both data parallel and inference"
self
.
_is_data_parallel
=
True
self
.
_build_strategy
=
build_strategy
self
.
_exec_strategy
=
exec_strategy
...
...
@@ -110,6 +127,8 @@ class CompiledProgram(object):
self
.
_exec_strategy
=
ExecutionStrategy
()
if
self
.
_build_strategy
is
None
:
self
.
_build_strategy
=
BuildStrategy
()
self
.
_build_strategy
.
is_distribution
=
framework
.
is_pserver_mode
(
self
.
_program
)
return
self
def
with_inference_optimize
(
self
,
config
):
...
...
@@ -120,11 +139,13 @@ class CompiledProgram(object):
Returns:
self
"""
assert
not
self
.
_is_data_parallel
,
"Cannot compile both data parallel and inference"
assert
not
self
.
_is_inference
,
"Already compiled with inference"
assert
any
([
isinstance
(
config
,
InferNativeConfig
),
isinstance
(
config
,
InferAnalysisConfig
)
])
self
.
_is_data_parallel
=
False
self
.
_is_inference
=
True
self
.
_infer_config
=
config
return
self
...
...
@@ -173,37 +194,41 @@ class CompiledProgram(object):
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
self
.
_exec_strategy
.
num_threads
=
cpu_num
*
2
trainers_endpoints
=
self
.
_program
.
_trainers_endpoints
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
if
self
.
_build_strategy
.
memory_optimize
is
None
:
self
.
_build_strategy
.
memory_optimize
=
False
if
self
.
_program
.
_is_mem_optimized
else
True
self
.
_build_strategy
.
memory_optimize
=
False
if
self
.
_program
and
self
.
_program
.
_is_mem_optimized
else
True
if
self
.
_build_strategy
.
enable_inplace
is
None
:
self
.
_build_strategy
.
enable_inplace
=
False
if
self
.
_program
.
_is_mem_optimized
else
True
self
.
_build_strategy
.
enable_inplace
=
False
if
self
.
_program
and
self
.
_program
.
_is_mem_optimized
else
True
# TODO(wuyi): trainer endpoings should be passed in through
# build_strategy, not program.xxx.
if
self
.
_program
and
self
.
_build_strategy
.
num_trainers
>
1
and
\
self
.
_program
.
_trainers_endpoints
:
tps
=
self
.
_program
.
_trainers_endpoints
if
self
.
_build_strategy
.
num_trainers
>
1
and
trainers_endpoints
:
assert
self
.
_build_strategy
.
num_trainers
==
len
(
t
rainers_endpoint
s
),
"num_trainers == len(end_points)"
self
.
_build_strategy
.
trainers_endpoints
=
t
rainers_endpoint
s
self
.
_persistable_vars
=
set
([
cpt
.
to_text
(
v
.
name
)
for
v
in
[
var
for
var
in
self
.
_program
.
list_vars
()
if
var
.
persistable
and
var
.
type
!=
core
.
VarDesc
.
VarType
.
RAW
]
t
p
s
),
"num_trainers == len(end_points)"
self
.
_build_strategy
.
trainers_endpoints
=
t
p
s
self
.
_persistable_vars
=
[]
for
block_id
in
range
(
self
.
_program_desc
.
num_blocks
()):
bdesc
=
self
.
_program_desc
.
block
(
block_id
)
self
.
_persistable_vars
.
extend
([
cpt
.
to_text
(
v
.
name
())
for
v
in
bdesc
.
all_vars
()
if
v
.
persistable
()
and
v
.
type
()
!=
core
.
VarDesc
.
VarType
.
RAW
])
places
=
list
(
map
(
_place_obj
,
self
.
_places
))
return
core
.
ParallelExecutor
(
places
,
self
.
_persistable_vars
,
self
.
_program
.
desc
,
places
,
set
(
self
.
_persistable_vars
),
cpt
.
to_text
(
self
.
_loss_name
)
if
self
.
_loss_name
else
six
.
u
(
''
),
self
.
_scope
,
self
.
_local_scopes
,
self
.
_exec_strategy
,
self
.
_build_strategy
)
self
.
_exec_strategy
,
self
.
_build_strategy
,
self
.
_graph
)
def
_compile_inference
(
self
):
assert
self
.
_is_data_parallel
is
False
return
core
.
create_paddle_predictor
(
self
.
_infer_config
)
def
_compile
(
self
,
scope
,
place
):
...
...
python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
浏览文件 @
c4187dbd
...
...
@@ -17,7 +17,9 @@ import numpy as np
import
six
from
.....
import
compat
as
cpt
from
....
import
core
from
....
import
Executor
from
....framework
import
IrGraph
from
....framework
import
IrNode
from
....framework
import
Program
from
....initializer
import
Constant
from
....
import
unique_name
...
...
@@ -31,7 +33,7 @@ __all__ = [
class
QuantizationTransformPass
(
object
):
def
__init__
(
self
,
scope
=
None
,
p
rogram_ex
e
=
None
,
p
lac
e
=
None
,
weight_bits
=
8
,
activation_bits
=
8
,
activation_quantize_type
=
'abs_max'
,
...
...
@@ -45,7 +47,7 @@ class QuantizationTransformPass(object):
scope(fluid.Scope): When activation use 'range_abs_max' as the quantize
type, this pass will create some new parameters. The scope is used to
initialize these new parameters.
p
rogram_exe(fluid.Executor): program_ex
e is used to initialize new
p
lace(fluid.CPUPlace|fluid.CUDAPlace): plac
e is used to initialize new
parameters described above.
weight_bits (int): quantization bit number for weights,
the bias is not quantized.
...
...
@@ -71,13 +73,13 @@ class QuantizationTransformPass(object):
from paddle.fluid import core
graph = IrGraph(core.Graph(program.desc), for_test=False)
exe = fluid.Executor(fluid.CPUPlace()
)
place = fluid.CPUPlace(
)
transform_pass = QuantizationTransformPass(fluid.global_scope(),
ex
e)
plac
e)
transform_pass.apply(graph)
"""
self
.
_scope
=
scope
self
.
_p
rogram_exe
=
program_ex
e
self
.
_p
lace
=
plac
e
self
.
_weight_bits
=
weight_bits
self
.
_activation_bits
=
activation_bits
...
...
@@ -118,7 +120,7 @@ class QuantizationTransformPass(object):
self
.
_is_test
=
graph
.
is_test
()
# marked the variable which has been dequantized.
dequantized_vars
=
collections
.
OrderedDict
()
persistable_vars
=
[
p
.
name
()
for
p
in
graph
.
all_persistable_
var
s
()]
persistable_vars
=
[
p
.
name
()
for
p
in
graph
.
all_persistable_
node
s
()]
def
_transform_forward
(
graph
,
op
):
for
var_node
in
op
.
inputs
:
...
...
@@ -149,7 +151,7 @@ class QuantizationTransformPass(object):
if
not
self
.
_is_test
:
self
.
_create_global_step
(
graph
)
ops
=
graph
.
all_ops
()
ops
=
graph
.
all_op
_node
s
()
# The process of _transform_forward and _transform_backward is needed in two for loops.
# The loop for transforming the forward graph:
for
op
in
ops
:
...
...
@@ -163,8 +165,8 @@ class QuantizationTransformPass(object):
if
len
(
self
.
_need_initialized
)
>
0
:
assert
self
.
_scope
is
not
None
,
\
'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
assert
self
.
_p
rogram_ex
e
is
not
None
,
\
'The p
rogram_ex
e cannot be set None when activation_quantize_type equals to range_abs_max.'
assert
self
.
_p
lac
e
is
not
None
,
\
'The p
lac
e cannot be set None when activation_quantize_type equals to range_abs_max.'
init_program
=
Program
()
for
var_desc
,
initializer
in
six
.
iteritems
(
self
.
_need_initialized
):
var
=
init_program
.
global_block
().
create_var
(
...
...
@@ -175,7 +177,8 @@ class QuantizationTransformPass(object):
lod_level
=
var_desc
.
lod_level
(),
persistable
=
var_desc
.
persistable
())
initializer
(
var
,
init_program
.
global_block
())
self
.
_program_exe
.
run
(
program
=
init_program
,
scope
=
self
.
_scope
)
exe
=
Executor
(
self
.
_place
)
exe
.
run
(
program
=
init_program
,
scope
=
self
.
_scope
)
return
graph
...
...
@@ -183,11 +186,11 @@ class QuantizationTransformPass(object):
if
self
.
_weight_quantize_type
==
'range_abs_max'
or
\
self
.
_activation_quantize_type
==
'range_abs_max'
:
counter_name
=
cpt
.
to_text
(
'@STEP_COUNTER@'
)
for
node
in
graph
.
all_vars
():
for
node
in
graph
.
all_var
_node
s
():
if
node
.
name
()
==
counter_name
:
self
.
_global_step
=
node
if
self
.
_global_step
is
None
:
global_step_in
=
graph
.
create_p
aram
_node
(
global_step_in
=
graph
.
create_p
ersistable
_node
(
name
=
counter_name
,
var_type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
,
shape
=
[
1
],
...
...
@@ -228,14 +231,14 @@ class QuantizationTransformPass(object):
quant_var_node
=
graph
.
create_var_node
(
name
=
self
.
_quantized_var_name
(
var_node
.
name
()),
var_type
=
var_node
.
var
().
type
(),
shape
=
var_node
.
var
().
shape
(),
var_dtype
=
var_node
.
var
().
dtype
())
var_type
=
var_node
.
type
(),
shape
=
var_node
.
shape
(),
var_dtype
=
var_node
.
dtype
())
scale_var_node
=
graph
.
create_var_node
(
name
=
self
.
_quantized_scale_name
(
var_node
.
name
()),
var_type
=
var_node
.
var
().
type
(),
shape
=
var_node
.
var
().
shape
(),
var_dtype
=
var_node
.
var
().
dtype
())
var_type
=
var_node
.
type
(),
shape
=
var_node
.
shape
(),
var_dtype
=
var_node
.
dtype
())
quant_op_node
=
graph
.
create_op_node
(
op_type
=
'fake_quantize_abs_max'
,
attrs
=
{
...
...
@@ -258,15 +261,15 @@ class QuantizationTransformPass(object):
quant_var_node
=
graph
.
create_var_node
(
name
=
self
.
_quantized_var_name
(
var_node
.
name
()),
var_type
=
var_node
.
var
().
type
(),
shape
=
var_node
.
var
().
shape
(),
var_dtype
=
var_node
.
var
().
dtype
())
var_type
=
var_node
.
type
(),
shape
=
var_node
.
shape
(),
var_dtype
=
var_node
.
dtype
())
scale_in_node
=
graph
.
create_p
aram
_node
(
scale_in_node
=
graph
.
create_p
ersistable
_node
(
name
=
self
.
_quantized_scale_name
(
var_node
.
name
()),
var_type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
,
shape
=
[
1
],
var_dtype
=
var_node
.
var
().
dtype
())
var_dtype
=
var_node
.
dtype
())
self
.
_need_initialized
[
scale_in_node
.
var
()]
=
Constant
(
value
=
0.001
)
scale_out_node
=
graph
.
create_var_node_from_desc
(
scale_in_node
.
var
())
...
...
@@ -275,11 +278,11 @@ class QuantizationTransformPass(object):
if
not
self
.
_is_test
:
# The name of scales_var_node maybe 'scales_0', 'scales_1', etc.
scales_node
=
graph
.
create_p
aram
_node
(
scales_node
=
graph
.
create_p
ersistable
_node
(
name
=
unique_name
.
generate
(
'scales'
),
var_type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
,
shape
=
[
self
.
_window_size
],
var_dtype
=
var_node
.
var
().
dtype
())
var_dtype
=
var_node
.
dtype
())
self
.
_need_initialized
[
scales_node
.
var
()]
=
Constant
(
value
=
0
)
inputs
[
'Iter'
]
=
self
.
_global_step
outputs
[
'OutScales'
]
=
scales_node
...
...
@@ -314,9 +317,9 @@ class QuantizationTransformPass(object):
dequant_var_node
=
graph
.
create_var_node
(
name
=
self
.
_dequantized_var_name
(
var_node
.
name
()),
var_type
=
var_node
.
var
().
type
(),
shape
=
var_node
.
var
().
shape
(),
var_dtype
=
var_node
.
var
().
dtype
())
var_type
=
var_node
.
type
(),
shape
=
var_node
.
shape
(),
var_dtype
=
var_node
.
dtype
())
max_range
=
(
1
<<
(
quant_bits
-
1
))
-
1
dequant_op_node
=
graph
.
create_op_node
(
op_type
=
'fake_dequantize_max_abs'
,
...
...
@@ -400,22 +403,22 @@ class QuantizationFreezePass(object):
Args:
graph(IrGraph): the applied graph.
"""
persistable_vars
=
[
p
.
name
()
for
p
in
graph
.
all_persistable_
var
s
()]
ops
=
graph
.
all_ops
()
persistable_vars
=
[
p
.
name
()
for
p
in
graph
.
all_persistable_
node
s
()]
ops
=
graph
.
all_op
_node
s
()
for
op_node
in
ops
:
op_name
=
op_node
.
name
()
if
op_name
in
self
.
_fake_quant_op_names
:
input_arg_name
=
op_node
.
op
().
input
(
'X'
)[
0
]
input_arg_name
=
op_node
.
input
(
'X'
)[
0
]
if
input_arg_name
in
persistable_vars
:
if
self
.
_weight_quantize_type
==
'abs_max'
:
param
=
self
.
_load_var
(
input_arg_name
)
scale_v
=
np
.
max
(
np
.
abs
(
param
))
else
:
scale_v
=
self
.
_load_var
(
op_node
.
op
().
output
(
'OutScale'
)
[
0
])[
0
]
scale_v
=
self
.
_load_var
(
op_node
.
output
(
'OutScale'
)
[
0
])[
0
]
self
.
_var_scale_map
[
input_arg_name
]
=
scale_v
else
:
scale_v
=
graph
.
var_node
(
op_node
.
o
p
().
o
utput
(
'OutScale'
)[
0
])
scale_v
=
graph
.
var_node
(
op_node
.
output
(
'OutScale'
)[
0
])
self
.
_var_scale_map
[
input_arg_name
]
=
scale_v
if
input_arg_name
in
persistable_vars
:
self
.
_remove_fake_quant_and_dequant_op
(
graph
,
op_node
)
...
...
@@ -425,13 +428,13 @@ class QuantizationFreezePass(object):
self
.
_weight_bits
)
self
.
_restore_var
(
input_arg_name
,
quantized_param_v
)
ops
=
graph
.
all_ops
()
ops
=
graph
.
all_op
_node
s
()
for
op_node
in
ops
:
op_name
=
op_node
.
name
()
if
op_name
in
self
.
_fake_dequant_op_names
:
self
.
_remove_fake_quant_and_dequant_op
(
graph
,
op_node
)
ops
=
graph
.
all_ops
()
ops
=
graph
.
all_op
_node
s
()
for
op_node
in
ops
:
op_name
=
op_node
.
name
()
if
op_name
in
self
.
_quantizable_ops
:
...
...
@@ -451,8 +454,8 @@ class QuantizationFreezePass(object):
return
graph
def
_remove_fake_quant_and_dequant_op
(
self
,
graph
,
op_node
):
k
=
op_node
.
o
p
().
o
utput
(
'Out'
)[
0
]
v
=
op_node
.
op
().
input
(
'X'
)[
0
]
k
=
op_node
.
output
(
'Out'
)[
0
]
v
=
op_node
.
input
(
'X'
)[
0
]
if
v
not
in
self
.
_op_input_rename_map
:
self
.
_op_input_rename_map
[
k
]
=
v
else
:
...
...
@@ -462,7 +465,7 @@ class QuantizationFreezePass(object):
def
_insert_post_dequant_op
(
self
,
graph
,
op_node
):
max_range
=
None
scale_var_node
=
None
persistable_vars
=
[
p
.
name
()
for
p
in
graph
.
all_persistable_
var
s
()]
persistable_vars
=
[
p
.
name
()
for
p
in
graph
.
all_persistable_
node
s
()]
for
var_node
in
op_node
.
inputs
:
name
=
var_node
.
name
()
if
name
in
self
.
_op_input_rename_map
:
...
...
@@ -480,7 +483,7 @@ class QuantizationFreezePass(object):
original_var_name
)
max_range
=
param_range
*
act_range
/
scale_v
else
:
assert
isinstance
(
scale_v
,
core
.
Node
)
assert
isinstance
(
scale_v
,
Ir
Node
)
scale_var_node
=
self
.
_var_scale_map
[
original_var_name
]
if
len
(
op_node
.
outputs
)
!=
1
:
...
...
@@ -490,9 +493,9 @@ class QuantizationFreezePass(object):
output_var_node
=
op_node
.
outputs
[
0
]
dequant_var_node
=
graph
.
create_var_node
(
name
=
self
.
_dequantized_var_name
(
output_var_node
.
name
()),
var_type
=
output_var_node
.
var
().
type
(),
shape
=
output_var_node
.
var
().
shape
(),
var_dtype
=
output_var_node
.
var
().
dtype
())
var_type
=
output_var_node
.
type
(),
shape
=
output_var_node
.
shape
(),
var_dtype
=
output_var_node
.
dtype
())
dequant_op_node
=
graph
.
create_op_node
(
op_type
=
'fake_dequantize_max_abs'
,
attrs
=
{
...
...
@@ -517,14 +520,19 @@ class QuantizationFreezePass(object):
def
_remove_unused_var_nodes
(
self
,
graph
):
all_used_vars
=
set
()
ops
=
graph
.
all_ops
()
ops
=
graph
.
all_op
_node
s
()
for
op_node
in
ops
:
for
input_node
in
op_node
.
inputs
:
all_used_vars
.
add
(
input_node
)
for
output_node
in
op_node
.
outputs
:
all_used_vars
.
add
(
output_node
)
all_unused_vars
=
graph
.
all_vars
()
-
all_used_vars
all_used_vars
=
{
n
.
node
for
n
in
all_used_vars
}
all_unused_vars
=
{
n
for
n
in
filter
(
lambda
node
:
node
.
node
not
in
all_used_vars
,
graph
.
all_var_nodes
())
}
graph
.
safe_remove_nodes
(
all_unused_vars
)
def
_original_var_name
(
self
,
var_name
):
...
...
@@ -583,8 +591,8 @@ class ConvertToInt8Pass(object):
Args:
graph(IrGraph): the applied graph.
"""
persistable_vars
=
[
p
.
name
()
for
p
in
graph
.
all_persistable_
var
s
()]
ops
=
graph
.
all_ops
()
persistable_vars
=
[
p
.
name
()
for
p
in
graph
.
all_persistable_
node
s
()]
ops
=
graph
.
all_op
_node
s
()
input_map
=
{}
for
op_node
in
ops
:
op_name
=
op_node
.
name
()
...
...
@@ -605,10 +613,10 @@ class ConvertToInt8Pass(object):
def
_convert_to_int8
(
self
,
graph
,
var_node
):
int8_var_node_name
=
var_node
.
name
()
+
".int8"
int8_var_node
=
graph
.
create_p
aram
_node
(
int8_var_node
=
graph
.
create_p
ersistable
_node
(
name
=
cpt
.
to_text
(
int8_var_node_name
),
var_type
=
var_node
.
var
().
type
(),
shape
=
var_node
.
var
().
shape
(),
var_type
=
var_node
.
type
(),
shape
=
var_node
.
shape
(),
var_dtype
=
core
.
VarDesc
.
VarType
.
INT8
)
array
=
self
.
_load_var
(
var_node
.
name
())
self
.
_scope
.
var
(
int8_var_node_name
)
...
...
@@ -624,14 +632,19 @@ class ConvertToInt8Pass(object):
def
_remove_unused_var_nodes
(
self
,
graph
):
all_used_vars
=
set
()
ops
=
graph
.
all_ops
()
ops
=
graph
.
all_op
_node
s
()
for
op_node
in
ops
:
for
input_node
in
op_node
.
inputs
:
all_used_vars
.
add
(
input_node
)
for
output_node
in
op_node
.
outputs
:
all_used_vars
.
add
(
output_node
)
all_unused_vars
=
graph
.
all_vars
()
-
all_used_vars
all_used_vars
=
{
n
.
node
for
n
in
all_used_vars
}
all_unused_vars
=
{
n
for
n
in
filter
(
lambda
node
:
node
.
node
not
in
all_used_vars
,
graph
.
all_var_nodes
())
}
graph
.
safe_remove_nodes
(
all_unused_vars
)
...
...
@@ -655,11 +668,11 @@ class TransformForMobilePass(object):
Args:
graph(IrGraph): the graph will be transformed.
"""
ops
=
graph
.
all_ops
()
ops
=
graph
.
all_op
_node
s
()
for
op_node
in
ops
:
name
=
op_node
.
name
()
if
name
in
self
.
_fake_quant_op_names
:
op_node
.
op
().
set_type
(
'quantize'
)
op_node
.
set_type
(
'quantize'
)
quant_node
=
graph
.
create_op_node_from_desc
(
op_node
.
op
())
for
input_node
in
op_node
.
inputs
:
graph
.
link_to
(
input_node
,
quant_node
)
...
...
@@ -667,7 +680,7 @@ class TransformForMobilePass(object):
graph
.
link_to
(
quant_node
,
output_node
)
graph
.
safe_remove_nodes
(
op_node
)
if
name
in
self
.
_fake_dequant_op_names
:
op_node
.
op
().
set_type
(
'dequantize'
)
op_node
.
set_type
(
'dequantize'
)
dequant_node
=
graph
.
create_op_node_from_desc
(
op_node
.
op
())
for
input_node
in
op_node
.
inputs
:
graph
.
link_to
(
input_node
,
dequant_node
)
...
...
python/paddle/fluid/contrib/slim/tests/test_graph.py
浏览文件 @
c4187dbd
...
...
@@ -61,16 +61,16 @@ class TestGraph(unittest.TestCase):
opt
.
minimize
(
loss
)
graph
=
IrGraph
(
core
.
Graph
(
main
.
desc
),
for_test
=
False
)
marked_nodes
=
set
()
for
op
in
graph
.
all_ops
():
for
op
in
graph
.
all_op
_node
s
():
if
op
.
name
().
find
(
'conv2d'
)
>
-
1
:
marked_nodes
.
add
(
op
)
graph
.
draw
(
'.'
,
'residual'
,
marked_nodes
)
self
.
assertFalse
(
graph
.
has_circle
())
self
.
assertEqual
(
graph
.
graph_num
(),
1
)
nodes
=
graph
.
topology_sort
()
self
.
assertEqual
(
len
(
nodes
),
len
(
graph
.
all_ops
()))
self
.
assertEqual
(
len
(
nodes
),
len
(
graph
.
all_op
_node
s
()))
nodes_map
=
graph
.
build_adjacency_list
()
self
.
assertEqual
(
len
(
nodes_map
),
len
(
graph
.
all_ops
()))
self
.
assertEqual
(
len
(
nodes_map
),
len
(
graph
.
all_op
_node
s
()))
nodes_num
=
len
(
graph
.
all_nodes
())
graph
.
safe_remove_nodes
(
marked_nodes
)
self
.
assertEqual
(
len
(
graph
.
all_nodes
()),
nodes_num
-
len
(
marked_nodes
))
...
...
python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
浏览文件 @
c4187dbd
...
...
@@ -130,15 +130,16 @@ class TestQuantizationTransformPass(unittest.TestCase):
loss
=
linear_fc
(
3
)
opt
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
opt
.
minimize
(
loss
)
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
graph
=
IrGraph
(
core
.
Graph
(
main
.
desc
),
for_test
=
False
)
transform_pass
=
QuantizationTransformPass
(
scope
=
fluid
.
global_scope
(),
p
rogram_exe
=
ex
e
,
p
lace
=
plac
e
,
activation_quantize_type
=
quant_type
)
transform_pass
.
apply
(
graph
)
marked_nodes
=
set
()
for
op
in
graph
.
all_ops
():
for
op
in
graph
.
all_op
_node
s
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
marked_nodes
.
add
(
op
)
graph
.
draw
(
'.'
,
'quantize_fc_'
+
quant_type
,
marked_nodes
)
...
...
@@ -146,7 +147,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
self
.
check_program
(
transform_pass
,
program
)
val_graph
=
IrGraph
(
core
.
Graph
(
program
.
desc
),
for_test
=
False
)
val_marked_nodes
=
set
()
for
op
in
val_graph
.
all_ops
():
for
op
in
val_graph
.
all_op
_node
s
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
val_marked_nodes
.
add
(
op
)
val_graph
.
draw
(
'.'
,
'val_fc_'
+
quant_type
,
val_marked_nodes
)
...
...
@@ -166,15 +167,16 @@ class TestQuantizationTransformPass(unittest.TestCase):
loss
=
residual_block
(
2
)
opt
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
opt
.
minimize
(
loss
)
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
graph
=
IrGraph
(
core
.
Graph
(
main
.
desc
),
for_test
=
False
)
transform_pass
=
QuantizationTransformPass
(
scope
=
fluid
.
global_scope
(),
p
rogram_exe
=
ex
e
,
p
lace
=
plac
e
,
activation_quantize_type
=
quant_type
)
transform_pass
.
apply
(
graph
)
marked_nodes
=
set
()
for
op
in
graph
.
all_ops
():
for
op
in
graph
.
all_op
_node
s
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
marked_nodes
.
add
(
op
)
graph
.
draw
(
'.'
,
'quantize_residual_'
+
quant_type
,
marked_nodes
)
...
...
@@ -182,7 +184,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
self
.
check_program
(
transform_pass
,
program
)
val_graph
=
IrGraph
(
core
.
Graph
(
program
.
desc
),
for_test
=
False
)
val_marked_nodes
=
set
()
for
op
in
val_graph
.
all_ops
():
for
op
in
val_graph
.
all_op
_node
s
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
val_marked_nodes
.
add
(
op
)
val_graph
.
draw
(
'.'
,
'val_residual_'
+
quant_type
,
val_marked_nodes
)
...
...
@@ -231,17 +233,17 @@ class TestQuantizationFreezePass(unittest.TestCase):
with
fluid
.
scope_guard
(
scope
):
exe
.
run
(
startup
)
transform_pass
=
QuantizationTransformPass
(
scope
=
scope
,
p
rogram_exe
=
ex
e
,
activation_quantize_type
=
quant_type
)
scope
=
scope
,
p
lace
=
plac
e
,
activation_quantize_type
=
quant_type
)
transform_pass
.
apply
(
main_graph
)
transform_pass
.
apply
(
test_graph
)
dev_name
=
'_gpu_'
if
use_cuda
else
'_cpu_'
marked_nodes
=
set
()
for
op
in
main_graph
.
all_ops
():
for
op
in
main_graph
.
all_op
_node
s
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
marked_nodes
.
add
(
op
)
main_graph
.
draw
(
'.'
,
'main'
+
dev_name
+
quant_type
,
marked_nodes
)
marked_nodes
=
set
()
for
op
in
test_graph
.
all_ops
():
for
op
in
test_graph
.
all_op
_node
s
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
marked_nodes
.
add
(
op
)
test_graph
.
draw
(
'.'
,
'test'
+
dev_name
+
quant_type
,
marked_nodes
)
...
...
@@ -251,11 +253,6 @@ class TestQuantizationFreezePass(unittest.TestCase):
iters
=
5
batch_size
=
8
#train_exe = fluid.ParallelExecutor(
# main_program=quantized_main_program,
# use_cuda=bool(use_cuda),
# loss_name=loss.name,
# scope=scope)
train_reader
=
paddle
.
batch
(
paddle
.
reader
.
shuffle
(
paddle
.
dataset
.
mnist
.
train
(),
buf_size
=
500
),
...
...
@@ -269,9 +266,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
loss_v
=
exe
.
run
(
program
=
quantized_main_program
,
feed
=
feeder
.
feed
(
data
),
fetch_list
=
[
loss
])
#loss_v = train_exe.run(feed=feeder.feed(data),
# fetch_list=[loss.name])
#print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
print
(
'{}: {}'
.
format
(
'loss'
+
dev_name
+
quant_type
,
loss_v
))
test_data
=
next
(
test_reader
())
with
fluid
.
program_guard
(
quantized_test_program
):
...
...
@@ -287,7 +282,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
freeze_pass
=
QuantizationFreezePass
(
scope
=
scope
,
place
=
place
)
freeze_pass
.
apply
(
test_graph
)
marked_nodes
=
set
()
for
op
in
test_graph
.
all_ops
():
for
op
in
test_graph
.
all_op
_node
s
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
marked_nodes
.
add
(
op
)
test_graph
.
draw
(
'.'
,
'test_freeze'
+
dev_name
+
quant_type
,
...
...
@@ -299,21 +294,21 @@ class TestQuantizationFreezePass(unittest.TestCase):
feed
=
feeder
.
feed
(
test_data
),
fetch_list
=
[
loss
])
self
.
assertAlmostEqual
(
test_loss1
,
test_loss2
,
delta
=
5e-3
)
#
print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
#
print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
print
(
'{}: {}'
.
format
(
'test_loss1'
+
dev_name
+
quant_type
,
test_loss1
))
print
(
'{}: {}'
.
format
(
'test_loss2'
+
dev_name
+
quant_type
,
test_loss2
))
w_freeze
=
np
.
array
(
scope
.
find_var
(
'conv2d_1.w_0'
).
get_tensor
())
# Maybe failed, this is due to the calculation precision
# self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
#
print('{}: {}'.format('w_freeze' + dev_name + quant_type,
#
np.sum(w_freeze)))
#
print('{}: {}'.format('w_quant' + dev_name + quant_type,
#
np.sum(w_quant)))
print
(
'{}: {}'
.
format
(
'w_freeze'
+
dev_name
+
quant_type
,
np
.
sum
(
w_freeze
)))
print
(
'{}: {}'
.
format
(
'w_quant'
+
dev_name
+
quant_type
,
np
.
sum
(
w_quant
)))
# Convert parameter to 8-bit.
convert_int8_pass
=
ConvertToInt8Pass
(
scope
=
scope
,
place
=
place
)
convert_int8_pass
.
apply
(
test_graph
)
marked_nodes
=
set
()
for
op
in
test_graph
.
all_ops
():
for
op
in
test_graph
.
all_op
_node
s
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
marked_nodes
.
add
(
op
)
test_graph
.
draw
(
'.'
,
'test_int8'
+
dev_name
+
quant_type
,
marked_nodes
)
...
...
@@ -330,14 +325,14 @@ class TestQuantizationFreezePass(unittest.TestCase):
w_8bit
=
np
.
array
(
scope
.
find_var
(
'conv2d_1.w_0.int8'
).
get_tensor
())
self
.
assertEqual
(
w_8bit
.
dtype
,
np
.
int8
)
self
.
assertEqual
(
np
.
sum
(
w_8bit
),
np
.
sum
(
w_freeze
))
#
print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
#
print('{}: {}'.format('w_freeze' + dev_name + quant_type,
#
np.sum(w_freeze)))
print
(
'{}: {}'
.
format
(
'w_8bit'
+
dev_name
+
quant_type
,
np
.
sum
(
w_8bit
)))
print
(
'{}: {}'
.
format
(
'w_freeze'
+
dev_name
+
quant_type
,
np
.
sum
(
w_freeze
)))
mobile_pass
=
TransformForMobilePass
()
mobile_pass
.
apply
(
test_graph
)
marked_nodes
=
set
()
for
op
in
test_graph
.
all_ops
():
for
op
in
test_graph
.
all_op
_node
s
():
if
op
.
name
().
find
(
'quantize'
)
>
-
1
:
marked_nodes
.
add
(
op
)
test_graph
.
draw
(
'.'
,
'test_mobile'
+
dev_name
+
quant_type
,
...
...
python/paddle/fluid/executor.py
浏览文件 @
c4187dbd
...
...
@@ -538,6 +538,8 @@ class Executor(object):
else
:
# TODO(panyx0718): Can compile program to optimize executor
# performance.
# TODO(panyx0718): executor should be able to run graph.
assert
program
.
_program
,
"CompiledProgram is compiled from graph, can only run with_data_parallel."
return
self
.
_run
(
program
.
_program
,
self
.
_default_executor
,
...
...
python/paddle/fluid/framework.py
浏览文件 @
c4187dbd
...
...
@@ -87,6 +87,15 @@ def _current_expected_place():
return
_imperative_current_expected_place_
def
is_pserver_mode
(
main_program
):
main
=
main_program
if
main_program
\
else
default_main_program
()
for
op
in
main
.
global_block
().
ops
:
if
op
.
type
in
[
"send"
,
"recv"
]:
return
True
return
False
class
NameScope
(
object
):
def
__init__
(
self
,
name
=
""
,
parent
=
None
):
self
.
_children
=
dict
()
...
...
@@ -378,16 +387,19 @@ class Variable(object):
# get_capacity is implemented
pass
self
.
block
.
vars
[
name
]
=
self
self
.
op
=
None
self
.
stop_gradient
=
stop_gradient
self
.
is_data
=
is_data
if
_in_imperative_mode
():
# record vars in tracer rather than blocks
self
.
_ivar
=
kwargs
.
get
(
"ivar"
,
None
)
if
not
self
.
_ivar
:
self
.
_ivar
=
core
.
VarBase
()
self
.
_ivar
=
core
.
VarBase
(
stop_gradient
)
self
.
_ivar
.
desc
=
self
.
desc
self
.
_ivar
.
stop_gradient
=
stop_gradient
if
persistable
:
self
.
block
.
vars
[
name
]
=
self
else
:
self
.
block
.
vars
[
name
]
=
self
self
.
op
=
None
self
.
stop_gradient
=
stop_gradient
self
.
is_data
=
is_data
def
_numpy
(
self
):
new_ivar
=
self
.
_ivar
.
_copy_to
(
core
.
CPUPlace
(),
True
)
...
...
@@ -723,7 +735,6 @@ class Operator(object):
self
.
_update_desc_attr
(
attr_name
,
attr_val
)
self
.
desc
.
check_attrs
()
if
self
.
_has_kernel
(
type
):
self
.
desc
.
infer_var_type
(
self
.
block
.
desc
)
self
.
desc
.
infer_shape
(
self
.
block
.
desc
)
...
...
@@ -731,6 +742,7 @@ class Operator(object):
if
_in_imperative_mode
():
self
.
iop
=
core
.
OpBase
()
self
.
iop
.
desc
=
self
.
desc
self
.
inputs
=
defaultdict
(
list
)
if
inputs
is
not
None
:
for
k
,
v
in
six
.
iteritems
(
inputs
):
...
...
@@ -738,6 +750,7 @@ class Operator(object):
self
.
inputs
[
k
].
append
(
v
.
_ivar
)
elif
isinstance
(
v
,
list
)
or
isinstance
(
v
,
tuple
):
self
.
inputs
[
k
].
extend
([
var
.
_ivar
for
var
in
v
])
self
.
outputs
=
defaultdict
(
list
)
if
outputs
is
not
None
:
for
k
,
v
in
six
.
iteritems
(
outputs
):
...
...
@@ -1187,6 +1200,15 @@ class Block(object):
else
:
raise
ValueError
(
"Var {0} is not found recursively"
.
format
(
name
))
def
_clear_block
(
self
):
# TODO(minqiyang): move this to backward_hooks
self
.
desc
.
_clear_block
()
for
name
in
self
.
vars
.
keys
():
assert
self
.
vars
[
name
].
persistable
del
self
.
ops
[:]
def
all_parameters
(
self
):
return
list
(
self
.
iter_parameters
())
...
...
@@ -1317,18 +1339,31 @@ class Block(object):
inputs
=
kwargs
.
get
(
"inputs"
,
None
),
outputs
=
kwargs
.
get
(
"outputs"
,
None
),
attrs
=
kwargs
.
get
(
"attrs"
,
None
))
self
.
ops
.
append
(
op
)
# TODO(minqiyang): add stop_gradient support in static mode too.
if
_in_imperative_mode
():
# record ops in tracer rather than blocks
#
# TODO(minqiyang): add op stop_gradient support in static mode too.
# currently, we only support stop_gradient in imperative mode.
self
.
_trace_op
(
op
,
kwargs
.
get
(
"stop_gradient"
,
False
))
self
.
ops
.
append
(
op
)
return
op
def
_trace_op
(
self
,
op
,
stop_gradient
=
False
):
if
_in_imperative_mode
():
_imperative_tracer
().
trace
(
op
.
iop
,
op
.
inputs
,
op
.
outputs
,
self
.
desc
,
_imperative_current_expected_place_
,
stop_gradient
)
backward_refs
=
_imperative_tracer
().
trace
(
op
.
iop
,
op
.
inputs
,
op
.
outputs
,
self
.
desc
,
_imperative_current_expected_place_
,
stop_gradient
)
# TODO(minqiyang): support backward_hooks to eager remove backward_refs
op
.
backward_refs
=
defaultdict
(
list
)
for
k
,
v
in
six
.
iteritems
(
op
.
inputs
):
if
k
in
backward_refs
:
op
.
backward_refs
[
k
]
=
op
.
inputs
[
k
]
for
k
,
v
in
six
.
iteritems
(
op
.
outputs
):
if
k
in
backward_refs
:
op
.
backward_refs
[
k
]
=
op
.
outputs
[
k
]
def
_insert_op
(
self
,
index
,
*
args
,
**
kwargs
):
"""
...
...
@@ -1383,6 +1418,7 @@ class Block(object):
outputs
=
kwargs
.
get
(
"outputs"
,
None
),
attrs
=
kwargs
.
get
(
"attrs"
,
None
))
self
.
ops
.
insert
(
0
,
op
)
if
_in_imperative_mode
():
self
.
_trace_op
(
op
,
kwargs
.
get
(
"stop_gradient"
,
False
))
return
op
...
...
@@ -1530,10 +1566,397 @@ class Block(object):
return
ret_var
class
IrNode
(
object
):
"""
Python IrNode. Beneath it is a core.Node, which is used for Ir Pass.
"""
def
__init__
(
self
,
node
):
"""
Construct an IrNode using core.Node.
Args:
node(core.Node): C++ Node.
"""
assert
isinstance
(
node
,
core
.
Node
),
'node must be the instance of core.Node.'
self
.
node
=
node
def
name
(
self
):
"""
Return the node name.
Returns:
str: node name.
"""
return
self
.
node
.
name
()
def
node_type
(
self
):
"""
Return the node type.
Returns:
core.Node.Type: node type(core.Node.Type.Operation or core.Node.Type.Variable).
"""
return
self
.
node
.
node_type
()
def
var
(
self
):
"""
Return the node variable description.
Returns:
core.VarDesc: node variable description.
"""
return
self
.
node
.
var
()
def
op
(
self
):
"""
Return the node operator description.
Returns:
core.OpDesc: node operator description.
"""
return
self
.
node
.
op
()
def
id
(
self
):
"""
Return the node id.
Returns:
int: node id.
"""
return
self
.
node
.
id
()
def
is_op
(
self
):
"""
If the node is an operator, then return true.
Returns:
bool: indicate whether the node is an operator.
"""
return
self
.
node
.
is_op
()
def
is_var
(
self
):
"""
If the node is a variable, then return true.
Returns:
bool: indicate whether the node is a variable.
"""
return
self
.
node
.
is_var
()
def
is_ctrl_var
(
self
):
"""
If the node is a control dependence variable, then return true.
Returns:
bool: indicate whether the node is a control dependence variable.
"""
return
self
.
node
.
is_ctrl_var
()
def
clear_inputs
(
self
):
"""
Clear the node inputs. After executing the `clear_inputs` function,
the node inputs will be empty.
"""
self
.
node
.
clear_inputs
()
def
remove_input_by_id
(
self
,
node_id
):
"""
Remove a node from inputs by the given node id.
Args:
node_id(int): the given node id.
"""
self
.
node
.
remove_input
(
node_id
)
def
remove_input
(
self
,
node
):
"""
Remove a node from inputs.
Args:
node(IrNode): the node being removed.
"""
self
.
node
.
remove_input
(
node
.
node
)
def
append_input
(
self
,
node
):
"""
Append a node in inputs.
Args:
node(IrNode): the node being appended.
"""
self
.
node
.
append_input
(
node
.
node
)
def
clear_outputs
(
self
):
"""
Clear the node outputs. After executing the `clear_outputs` function,
the node outputs will be empty.
"""
self
.
node
.
clear_outputs
()
def
remove_output_by_id
(
self
,
node_id
):
"""
Remove a node from outputs by the given node id.
Args:
node_id(int): the given node id.
"""
self
.
node
.
remove_output
(
node_id
)
def
remove_output
(
self
,
node
):
"""
Remove a node from outputs.
Args:
node(IrNode): the node being removed.
"""
self
.
node
.
remove_output
(
node
.
node
)
def
append_output
(
self
,
node
):
"""
Append a node in outputs.
Args:
node(IrNode): the node being appended.
"""
self
.
node
.
append_output
(
node
.
node
)
@
property
def
inputs
(
self
):
"""
Return the node inputs.
Returns:
list(IrNode): node inputs wrapped by IrNode.
"""
return
[
IrNode
(
n
)
for
n
in
self
.
node
.
inputs
]
@
property
def
outputs
(
self
):
"""
Return the node outputs.
Returns:
list(IrNode): node outputs wrapped by IrNode.
"""
return
[
IrNode
(
n
)
for
n
in
self
.
node
.
outputs
]
class
IrVarNode
(
IrNode
):
"""
Python IrVarNode. Beneath it is a core.Node, it inherits from IrNode.
"""
def
__init__
(
self
,
node
):
"""
Construct an IrVarNode using core.Node.
Args:
node(core.Node): C++ Node.
"""
assert
isinstance
(
node
,
core
.
Node
)
and
node
.
is_var
(),
\
'node must be the instance of core.Node and it must be a variable node.'
super
(
IrVarNode
,
self
).
__init__
(
node
)
self
.
node
=
node
def
set_shape
(
self
,
shape
):
"""
Set the node variable shape.
Args:
shape(list): shape to be set.
"""
assert
self
.
node
.
var
()
is
not
None
,
\
"The node variable description cannot be None."
self
.
node
.
var
().
set_shape
(
shape
)
def
persistable
(
self
):
"""
If the variable node is a persistable variable, then return true.
Returns:
bool: indicate whether the variable is persistable.
"""
assert
self
.
node
.
var
()
is
not
None
,
\
"The node variable description cannot be None."
return
self
.
node
.
var
().
persistable
()
def
type
(
self
):
"""
Return the variable type.
Returns:
core.VarDesc.VarType: the variable type.
"""
assert
self
.
node
.
var
()
is
not
None
,
\
"The node variable description cannot be None."
return
self
.
node
.
var
().
type
()
def
dtype
(
self
):
"""
Return the variable data type.
Returns:
core.VarDesc.VarType: the variable data type.
"""
assert
self
.
node
.
var
()
is
not
None
,
\
"The node variable description cannot be None."
return
self
.
node
.
var
().
dtype
()
def
shape
(
self
):
"""
Return the variable shape.
Returns:
list: the variable shape.
"""
assert
self
.
node
.
var
()
is
not
None
,
\
"The node variable description cannot be None."
return
self
.
node
.
var
().
shape
()
@
property
def
inputs
(
self
):
"""
Return the node inputs.
Returns:
list(IrOpNode): node inputs wrapped by IrOpNode.
"""
return
[
IrOpNode
(
n
)
for
n
in
self
.
node
.
inputs
]
@
property
def
outputs
(
self
):
"""
Return the node outputs.
Returns:
list(IrOpNode): node outputs wrapped by IrOpNode.
"""
return
[
IrOpNode
(
n
)
for
n
in
self
.
node
.
outputs
]
class
IrOpNode
(
IrNode
):
"""
Python IrOpNode. Beneath it is a core.Node, it inherits from IrNode.
"""
def
__init__
(
self
,
node
):
"""
Construct an IrOpNode using core.Node.
Args:
node(core.Node): C++ Node.
"""
assert
isinstance
(
node
,
core
.
Node
)
and
node
.
is_op
(),
\
'node must be the instance of core.Node and it must be a operator node.'
super
(
IrOpNode
,
self
).
__init__
(
node
)
self
.
node
=
node
def
rename_input
(
self
,
old_input_name
,
new_input_name
):
"""
Rename the input of this node.
Args:
old_input_name(str): the old input name.
new_input_name(str): the new input name.
"""
assert
self
.
node
.
op
()
is
not
None
,
\
"The node operator description cannot be None."
self
.
node
.
op
().
_rename_input
(
old_input_name
,
new_input_name
)
def
input
(
self
,
name
):
"""
Get the argument name list by the parameter name for input.
Args:
name(str): the parameter name.
Returns:
list(str): the argument name list.
"""
assert
self
.
node
.
op
()
is
not
None
,
\
"The node operator description cannot be None."
return
self
.
node
.
op
().
input
(
name
)
def
output
(
self
,
name
):
"""
Get the argument name list by the parameter name for output.
Args:
name(str): the parameter name.
Returns:
list(str): the argument name list.
"""
assert
self
.
node
.
op
()
is
not
None
,
\
"The node operator description cannot be None."
return
self
.
node
.
op
().
output
(
name
)
def
set_type
(
self
,
new_type
):
"""
Change the operator type into new type.
Args:
new_type(str): new operator type to be set.
"""
assert
self
.
node
.
op
()
is
not
None
,
\
"The node operator description cannot be None."
return
self
.
node
.
op
().
set_type
(
new_type
)
def
set_attr
(
self
,
name
,
val
):
"""
Set the value of attribute by attribute's name.
Args:
name(str): the attribute name.
val(bool|int|str|float|list): the value of the attribute.
"""
self
.
_update_desc_attr
(
name
,
val
)
def
_update_desc_attr
(
self
,
name
,
val
):
"""
Update the value of the op desc's attribute by attribute's name.
"""
assert
self
.
node
.
op
()
is
not
None
,
\
"The node operator description cannot be None."
desc
=
self
.
node
.
op
()
if
isinstance
(
val
,
Block
):
desc
.
set_block_attr
(
name
,
val
.
desc
)
elif
isinstance
(
val
,
list
)
and
val
and
\
all
(
isinstance
(
v
,
Block
)
for
v
in
val
):
desc
.
set_blocks_attr
(
name
,
[
v
.
desc
for
v
in
val
])
elif
isinstance
(
val
,
core
.
BlockDesc
)
or
\
isinstance
(
val
,
core
.
ProgramDesc
):
desc
.
set_serialized_attr
(
name
,
val
.
serialize_to_string
())
else
:
desc
.
_set_attr
(
name
,
val
)
@
property
def
inputs
(
self
):
"""
Return the node inputs.
Returns:
list(IrVarNode): node inputs wrapped by IrVarNode.
"""
return
[
IrVarNode
(
n
)
for
n
in
self
.
node
.
inputs
]
@
property
def
outputs
(
self
):
"""
Return the node outputs.
Returns:
list(IrVarNode): node outputs wrapped by IrVarNode.
"""
return
[
IrVarNode
(
n
)
for
n
in
self
.
node
.
outputs
]
class
IrGraph
(
object
):
"""
Python IrGraph. Beneath it is a core.Graph, which is used for
creat
e
a c++ Ir Pass Graph. An IrGraph is just a graph view of
creat
ing
a c++ Ir Pass Graph. An IrGraph is just a graph view of
a Program. In an IrGraph, both Variables and Operators are graph
nodes.
"""
...
...
@@ -1561,15 +1984,15 @@ class IrGraph(object):
"""
Return all nodes included in the graph as a set.
"""
return
{
node
for
node
in
self
.
graph
.
nodes
()}
return
{
IrNode
(
node
)
for
node
in
self
.
graph
.
nodes
()}
def
all_vars
(
self
):
def
all_var
_node
s
(
self
):
"""
Return all variable nodes included in the graph as a set.
"""
return
{
node
for
node
in
self
.
graph
.
nodes
()
if
node
.
is_var
()}
return
{
IrVarNode
(
node
)
for
node
in
self
.
graph
.
nodes
()
if
node
.
is_var
()}
def
all_persistable_
var
s
(
self
):
def
all_persistable_
node
s
(
self
):
"""
Return all persistable variable nodes included in the graph as a set.
"""
...
...
@@ -1578,13 +2001,13 @@ class IrGraph(object):
if
node
.
is_var
()
and
node
.
var
()
is
not
None
and
node
.
var
(
).
persistable
():
persistable_nodes
.
add
(
node
)
return
persistable_nodes
return
{
IrVarNode
(
p
)
for
p
in
persistable_nodes
}
def
all_ops
(
self
):
def
all_op
_node
s
(
self
):
"""
Return all operator nodes included in the graph as a set.
"""
return
{
node
for
node
in
self
.
graph
.
nodes
()
if
node
.
is_op
()}
return
{
IrOpNode
(
node
)
for
node
in
self
.
graph
.
nodes
()
if
node
.
is_op
()}
def
var_node
(
self
,
name
):
"""
...
...
@@ -1598,14 +2021,14 @@ class IrGraph(object):
doesn't have a variable with the giving name.
Returns:
core.
Node: the variable node with the giving name.
IrVar
Node: the variable node with the giving name.
"""
if
not
isinstance
(
name
,
six
.
string_types
):
raise
TypeError
(
"var require string as parameter, but get %s instead."
%
(
type
(
name
)))
target_var_node
=
None
var_nodes
=
self
.
all_vars
()
var_nodes
=
self
.
all_var
_node
s
()
for
var_node
in
var_nodes
:
if
var_node
.
name
()
==
name
:
target_var_node
=
var_node
...
...
@@ -1613,7 +2036,7 @@ class IrGraph(object):
raise
ValueError
(
"var_node %s not in this graph"
%
name
)
return
target_var_node
def
create_p
aram
_node
(
self
,
name
,
var_type
,
shape
,
var_dtype
):
def
create_p
ersistable
_node
(
self
,
name
,
var_type
,
shape
,
var_dtype
):
"""
Create a persistable variable node in the graph. In IrGraph,
it can not distinguish between persistable variables and parameters.
...
...
@@ -1625,14 +2048,14 @@ class IrGraph(object):
var_dtype(core.VarDesc.VarType): the data type of the persistable variable node.
Returns:
core.
Node: the created persistable variable node.
IrVar
Node: the created persistable variable node.
"""
var_desc
=
core
.
VarDesc
(
name
)
var_desc
.
set_type
(
var_type
)
var_desc
.
set_shape
(
shape
)
var_desc
.
set_dtype
(
var_dtype
)
var_desc
.
set_persistable
(
True
)
return
self
.
graph
.
create_var_node
(
var_desc
)
return
IrVarNode
(
self
.
graph
.
create_var_node
(
var_desc
)
)
def
create_var_node
(
self
,
name
,
var_type
,
shape
,
var_dtype
):
"""
...
...
@@ -1646,14 +2069,14 @@ class IrGraph(object):
var_dtype(core.VarDesc.VarType): the data type of the variable node.
Returns:
core.
Node: the created variable node.
IrVar
Node: the created variable node.
"""
var_desc
=
core
.
VarDesc
(
name
)
var_desc
.
set_type
(
var_type
)
var_desc
.
set_shape
(
shape
)
var_desc
.
set_dtype
(
var_dtype
)
return
self
.
graph
.
create_var_node
(
var_desc
)
return
IrVarNode
(
self
.
graph
.
create_var_node
(
var_desc
)
)
def
create_var_node_from_desc
(
self
,
var_desc
):
"""
...
...
@@ -1664,9 +2087,9 @@ class IrGraph(object):
var_desc(core.VarDesc): the giving variable description.
Returns:
core.
Node: the created variable node.
IrVar
Node: the created variable node.
"""
return
self
.
graph
.
create_var_node
(
var_desc
)
return
IrVarNode
(
self
.
graph
.
create_var_node
(
var_desc
)
)
def
create_op_node
(
self
,
op_type
,
attrs
,
inputs
,
outputs
):
"""
...
...
@@ -1679,7 +2102,7 @@ class IrGraph(object):
outputs(dict): the outpus of the operator node.
Returns:
core.
Node: the created operator node.
IrOp
Node: the created operator node.
"""
op_desc
=
core
.
OpDesc
()
op_desc
.
set_type
(
op_type
)
...
...
@@ -1695,7 +2118,7 @@ class IrGraph(object):
var_nodes
=
[
var_nodes
]
op_desc
.
set_output
(
output_name
,
[
var_node
.
name
()
for
var_node
in
var_nodes
])
return
self
.
graph
.
create_op_node
(
op_desc
)
return
IrOpNode
(
self
.
graph
.
create_op_node
(
op_desc
)
)
def
create_op_node_from_desc
(
self
,
op_desc
):
"""
...
...
@@ -1705,40 +2128,40 @@ class IrGraph(object):
op_desc(core.VarDesc): the giving operator description.
Returns:
core.
Node: the created operator node.
IrOp
Node: the created operator node.
"""
return
self
.
graph
.
create_op_node
(
op_desc
)
return
IrOpNode
(
self
.
graph
.
create_op_node
(
op_desc
)
)
def
update_input_link
(
self
,
old_input_node
,
new_input_node
,
op_node
):
"""
Update the input's link of a operator node.
Args:
old_input_node(
core.
Node): the old input node of the giving op_node.
new_input_node(
core.
Node): the new input node of the giving op_node.
op_node(
core.
Node): the operator node that is needed to update input's link.
old_input_node(
Ir
Node): the old input node of the giving op_node.
new_input_node(
Ir
Node): the new input node of the giving op_node.
op_node(
IrOp
Node): the operator node that is needed to update input's link.
"""
assert
old_input_node
in
self
.
graph
.
nodes
()
and
new_input_
node
in
\
self
.
graph
.
nodes
()
and
op_node
in
self
.
graph
.
nodes
(),
\
assert
old_input_node
.
node
in
self
.
graph
.
nodes
()
and
new_input_node
.
node
in
\
self
.
graph
.
nodes
()
and
op_node
.
node
in
self
.
graph
.
nodes
(),
\
'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
old_input_node
.
outputs_remove
(
op_node
)
op_node
.
inputs_remove
(
old_input_node
)
new_input_node
.
outputs_append
(
op_node
)
op_node
.
inputs_append
(
new_input_node
)
op_node
.
op
().
_
rename_input
(
old_input_node
.
name
(),
new_input_node
.
name
())
old_input_node
.
remove_output
(
op_node
)
op_node
.
remove_input
(
old_input_node
)
new_input_node
.
append_output
(
op_node
)
op_node
.
append_input
(
new_input_node
)
op_node
.
rename_input
(
old_input_node
.
name
(),
new_input_node
.
name
())
def
link_to
(
self
,
node_in
,
node_out
):
"""
Connect two nodes.
Args:
node_in(
core.
Node): the input node.
node_out(
core.
Node): the output node.
node_in(
Ir
Node): the input node.
node_out(
Ir
Node): the output node.
"""
assert
node_in
in
self
.
graph
.
nodes
()
and
node_out
in
self
.
graph
.
nodes
(),
\
assert
node_in
.
node
in
self
.
graph
.
nodes
()
and
node_out
.
node
in
self
.
graph
.
nodes
(),
\
'The two arguments(node_in&node_out) must be in the graph nodes.'
node_in
.
outputs_append
(
node_out
)
node_out
.
inputs_append
(
node_in
)
node_in
.
append_output
(
node_out
)
node_out
.
append_input
(
node_in
)
def
safe_remove_nodes
(
self
,
remove_nodes
):
"""
...
...
@@ -1753,7 +2176,8 @@ class IrGraph(object):
remove_nodes
=
set
(
remove_nodes
)
else
:
remove_nodes
=
{
remove_nodes
}
core
.
graph_safe_remove_nodes
(
self
.
graph
,
remove_nodes
)
original_nodes
=
{
n
.
node
for
n
in
remove_nodes
}
core
.
graph_safe_remove_nodes
(
self
.
graph
,
original_nodes
)
def
has_circle
(
self
):
"""
...
...
@@ -1780,18 +2204,23 @@ class IrGraph(object):
Notes: the `graph` cannot contain a circle.
Returns:
set(
core.
Node): nodes in topology order.
set(
Ir
Node): nodes in topology order.
"""
return
core
.
topology_sort
(
self
.
graph
)
ordered_nodes
=
core
.
topology_sort
(
self
.
graph
)
return
{
IrNode
(
n
)
for
n
in
ordered_nodes
}
def
build_adjacency_list
(
self
):
"""
Build an adjacency list of operations for the `graph`.
Returns:
dict{
core.Node: set(core.
Node)}: the adjacency list.
dict{
IrNode: set(Ir
Node)}: the adjacency list.
"""
return
core
.
build_adjacency_list
(
self
.
graph
)
adj_list
=
core
.
build_adjacency_list
(
self
.
graph
)
wrapped_adj_list
=
dict
()
for
k
,
v
in
six
.
iteritems
(
adj_list
):
wrapped_adj_list
[
IrNode
(
k
)]
=
{
IrNode
(
n
)
for
n
in
v
}
return
wrapped_adj_list
def
draw
(
self
,
save_path
,
name
,
marked_nodes
=
None
,
remove_ctr_var
=
True
):
"""
...
...
@@ -1801,7 +2230,7 @@ class IrGraph(object):
Args:
save_path(str): the save path of drawn graph.
name(str): the name of drawn graph.
marked_nodes(set(
core.
Node)): nodes that are needed to be marked.
marked_nodes(set(
Ir
Node)): nodes that are needed to be marked.
Default value is None.
remove_ctr_var(bool): If it is set True, all control variable nodes
in the graph will be removed. Default value is True.
...
...
@@ -1816,20 +2245,22 @@ class IrGraph(object):
print
(
'The {} is saved as the dot filetype.'
.
format
(
dot_file_path
))
if
remove_ctr_var
:
remove_ctr_vars
=
set
()
for
node
in
self
.
graph
.
nodes
():
if
remove_ctr_var
:
for
node
in
self
.
all_var_nodes
():
if
node
.
is_ctrl_var
():
remove_ctr_vars
.
add
(
node
)
self
.
safe_remove_nodes
(
remove_ctr_vars
)
ops_num
=
0
for
node
in
self
.
graph
.
nodes
():
if
node
.
is_op
():
ops_num
+=
1
print
(
'Total ops num = {}.'
.
format
(
ops_num
))
print
(
'Total ops num = {}.'
.
format
(
len
(
self
.
all_op_nodes
())))
if
marked_nodes
is
not
None
:
if
not
isinstance
(
marked_nodes
,
set
):
if
isinstance
(
marked_nodes
,
Iterable
):
marked_nodes
=
set
(
marked_nodes
)
else
:
marked_nodes
=
{
marked_nodes
}
marked_nodes
=
{
n
.
node
for
n
in
marked_nodes
}
remove_ctr_vars
=
{
n
.
node
for
n
in
remove_ctr_vars
}
marked_nodes
=
marked_nodes
-
remove_ctr_vars
if
self
.
graph
.
has
(
'__graphviz__marked_node__'
):
self
.
graph
.
erase
(
'__graphviz__marked_node__'
)
...
...
python/paddle/fluid/imperative/layers.py
浏览文件 @
c4187dbd
...
...
@@ -17,7 +17,7 @@ import contextlib
import
sys
import
numpy
as
np
import
collections
from
..
import
unique_name
from
paddle.fluid
import
core
from
paddle.fluid
import
framework
from
paddle.fluid.imperative
import
base
...
...
@@ -26,14 +26,33 @@ __all__ = ['Layer', 'PyLayer']
class
Layer
(
core
.
Layer
):
"""Layers composed of operators."""
"""Layers composed of operators.
Args:
name_scope: prefix name used by the layer to name parameters.
If prefix is "my_model/layer_1", parameter name in MyLayer
can be "my_model/layer_1/MyLayer/w_n", where w is the parameter
base name and n is an unique suffix auto-generated.
dtype: data type for the variables in the layer.
"""
def
__init__
(
self
,
dtype
=
core
.
VarDesc
.
VarType
.
FP32
,
name
=
None
):
def
__init__
(
self
,
name_scope
,
dtype
=
core
.
VarDesc
.
VarType
.
FP32
):
self
.
_full_name
=
unique_name
.
generate
(
name_scope
+
"/"
+
self
.
__class__
.
__name__
)
self
.
_built
=
False
self
.
_dtype
=
dtype
self
.
_parameters
=
collections
.
OrderedDict
()
self
.
_sub_layers
=
collections
.
OrderedDict
()
def
full_name
(
self
):
"""Full name for this layers.
Full name is composed by name_scope + "/" + MyLayer.__class__.__name__
Returns full name of this name.
"""
return
self
.
_full_name
def
parameters
(
self
,
include_sublayers
=
True
):
"""Returns a list of Parameters from current and sub-layers.
...
...
python/paddle/fluid/imperative/nn.py
浏览文件 @
c4187dbd
...
...
@@ -27,6 +27,7 @@ __all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
class
Conv2D
(
layers
.
Layer
):
def
__init__
(
self
,
name_scope
,
num_channels
,
num_filters
,
filter_size
,
...
...
@@ -38,19 +39,17 @@ class Conv2D(layers.Layer):
act
=
None
,
param_attr
=
None
,
bias_attr
=
None
,
name
=
None
,
dtype
=
core
.
VarDesc
.
VarType
.
FP32
):
assert
param_attr
is
not
False
,
"param_attr should not be False here."
super
(
Conv2D
,
self
).
__init__
(
name
=
nam
e
,
dtype
=
dtype
)
super
(
Conv2D
,
self
).
__init__
(
name
_scop
e
,
dtype
=
dtype
)
# TODO(minqiyang): Move this to the top.
from
..layer_helper
import
LayerHelper
self
.
_helper
=
LayerHelper
(
type
(
self
).
__name__
,
self
.
full_name
()
,
param_attr
=
param_attr
,
bias_attr
=
bias_attr
,
dtype
=
dtype
,
name
=
name
,
act
=
act
)
self
.
_groups
=
groups
...
...
@@ -143,6 +142,7 @@ class Conv2D(layers.Layer):
class
Pool2D
(
layers
.
Layer
):
def
__init__
(
self
,
name_scope
,
pool_size
=-
1
,
pool_type
=
"max"
,
pool_stride
=
1
,
...
...
@@ -151,7 +151,6 @@ class Pool2D(layers.Layer):
use_cudnn
=
True
,
ceil_mode
=
False
,
exclusive
=
True
,
name
=
None
,
dtype
=
core
.
VarDesc
.
VarType
.
FP32
):
if
pool_type
not
in
[
"max"
,
"avg"
]:
raise
ValueError
(
...
...
@@ -166,10 +165,10 @@ class Pool2D(layers.Layer):
if
not
isinstance
(
use_cudnn
,
bool
):
raise
ValueError
(
"use_cudnn should be True or False"
)
super
(
Pool2D
,
self
).
__init__
(
name
=
nam
e
,
dtype
=
dtype
)
super
(
Pool2D
,
self
).
__init__
(
name
_scop
e
,
dtype
=
dtype
)
from
..layer_helper
import
LayerHelper
self
.
_helper
=
LayerHelper
(
type
(
self
).
__name__
,
dtype
=
dtype
,
name
=
nam
e
)
self
.
_helper
=
LayerHelper
(
self
.
full_name
(),
dtype
=
dtyp
e
)
self
.
_pool_type
=
pool_type
self
.
_pool_size
=
utils
.
convert_to_list
(
pool_size
,
2
,
'pool_size'
)
...
...
@@ -205,25 +204,24 @@ class Pool2D(layers.Layer):
class
FC
(
layers
.
Layer
):
def
__init__
(
self
,
name_scope
,
size
,
param_attr
=
None
,
bias_attr
=
None
,
num_flatten_dims
=
1
,
dtype
=
core
.
VarDesc
.
VarType
.
FP32
,
act
=
None
,
name
=
None
):
super
(
FC
,
self
).
__init__
()
act
=
None
):
super
(
FC
,
self
).
__init__
(
name_scope
)
self
.
_size
=
size
self
.
_num_flatten_dims
=
num_flatten_dims
self
.
_dtype
=
dtype
from
..layer_helper
import
LayerHelper
self
.
_helper
=
LayerHelper
(
'FC'
,
self
.
full_name
()
,
param_attr
=
param_attr
,
bias_attr
=
bias_attr
,
act
=
act
,
name
=
name
)
act
=
act
)
def
_build_once
(
self
,
input
):
input_shape
=
input
.
shape
...
...
@@ -282,6 +280,7 @@ class FC(layers.Layer):
class
BatchNorm
(
layers
.
Layer
):
def
__init__
(
self
,
name_scope
,
num_channels
,
act
=
None
,
is_test
=
False
,
...
...
@@ -292,22 +291,20 @@ class BatchNorm(layers.Layer):
dtype
=
core
.
VarDesc
.
VarType
.
FP32
,
data_layout
=
'NCHW'
,
in_place
=
False
,
name
=
None
,
moving_mean_name
=
None
,
moving_variance_name
=
None
,
do_model_average_for_mean_and_var
=
False
,
fuse_with_relu
=
False
,
use_global_stats
=
False
):
super
(
BatchNorm
,
self
).
__init__
()
super
(
BatchNorm
,
self
).
__init__
(
name_scope
)
assert
bias_attr
is
not
False
,
"bias_attr should not be False in batch_norm."
from
..layer_helper
import
LayerHelper
self
.
_helper
=
LayerHelper
(
'batch_norm'
,
self
.
full_name
()
,
param_attr
=
param_attr
,
bias_attr
=
bias_attr
,
name
=
name
,
act
=
act
)
if
dtype
==
core
.
VarDesc
.
VarType
.
FP16
:
...
...
@@ -419,6 +416,7 @@ class Embedding(layers.Layer):
constructor.
Args:
name_scope: See base class.
size(tuple|list): The shape of the look up table parameter. It should
have two elements which indicate the size of the dictionary of
embeddings and the size of each embedding vector respectively.
...
...
@@ -446,6 +444,7 @@ class Embedding(layers.Layer):
"""
def
__init__
(
self
,
name_scope
,
size
,
is_sparse
=
False
,
is_distributed
=
False
,
...
...
@@ -453,7 +452,7 @@ class Embedding(layers.Layer):
param_attr
=
None
,
dtype
=
'float32'
):
super
(
Embedding
,
self
).
__init__
()
super
(
Embedding
,
self
).
__init__
(
name_scope
)
self
.
_size
=
size
self
.
_is_sparse
=
is_sparse
self
.
_is_distributed
=
is_distributed
...
...
@@ -468,7 +467,7 @@ class Embedding(layers.Layer):
assert
self
.
_is_sparse
is
True
and
self
.
_is_distributed
is
False
from
..layer_helper
import
LayerHelper
self
.
_helper
=
LayerHelper
(
'embedding'
,
param_attr
=
param_attr
)
self
.
_helper
=
LayerHelper
(
self
.
full_name
()
,
param_attr
=
param_attr
)
self
.
_w
=
self
.
_helper
.
create_parameter
(
attr
=
self
.
_param_attr
,
shape
=
self
.
_size
,
...
...
python/paddle/fluid/layer_helper.py
浏览文件 @
c4187dbd
...
...
@@ -34,6 +34,9 @@ class LayerHelper(object):
self
.
kwargs
=
kwargs
self
.
layer_type
=
layer_type
name
=
self
.
kwargs
.
get
(
'name'
,
None
)
# TODO(panyx0718, minqiyang): imperative mode
# can not use both `layer_type` and `name`. Deprecate LayerHelper
# and write a Helper for imperative mode.
if
name
is
None
:
self
.
kwargs
[
'name'
]
=
unique_name
.
generate
(
self
.
layer_type
)
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
c4187dbd
...
...
@@ -551,9 +551,10 @@ def yolov3_loss(x,
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchors = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors,
ignore_thresh=0.5, downsample_ratio=32)
anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32)
"""
helper
=
LayerHelper
(
'yolov3_loss'
,
**
locals
())
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
c4187dbd
...
...
@@ -87,6 +87,7 @@ __all__ = [
'transpose'
,
'im2sequence'
,
'nce'
,
'sampled_softmax_with_cross_entropy'
,
'hsigmoid'
,
'beam_search'
,
'row_conv'
,
...
...
@@ -668,7 +669,11 @@ def dynamic_lstmp(input,
candidate_activation
=
'tanh'
,
proj_activation
=
'tanh'
,
dtype
=
'float32'
,
name
=
None
):
name
=
None
,
h_0
=
None
,
c_0
=
None
,
cell_clip
=
None
,
proj_clip
=
None
):
"""
**Dynamic LSTMP Layer**
...
...
@@ -785,6 +790,17 @@ def dynamic_lstmp(input,
dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
h_0(Variable): The initial hidden state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size and D is the projection size.
c_0(Variable): The initial cell state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size. `h_0` and `c_0` can be NULL but only at the same time.
cell_clip(float): If provided the cell state is clipped
by this value prior to the cell output activation.
proj_clip(float): If `num_proj > 0` and `proj_clip` is
provided, then the projected values are clipped elementwise to within
`[-proj_clip, proj_clip]`.
Returns:
tuple: A tuple of two output variable: the projection of hidden state,
\
...
...
@@ -831,25 +847,41 @@ def dynamic_lstmp(input,
batch_hidden
=
helper
.
create_variable_for_type_inference
(
dtype
)
batch_gate
=
helper
.
create_variable_for_type_inference
(
dtype
)
batch_cell_pre_act
=
helper
.
create_variable_for_type_inference
(
dtype
)
helper
.
append_op
(
type
=
'lstmp'
,
inputs
=
{
inputs
=
{
'Input'
:
input
,
'Weight'
:
weight
,
'ProjWeight'
:
proj_weight
,
'Bias'
:
bias
},
}
batch_size
=
input
.
shape
[
0
]
if
h_0
:
assert
h_0
.
shape
==
(
batch_size
,
proj_size
),
\
'The shape of h0 should be (batch_size, %d)'
%
proj_size
inputs
[
'H0'
]
=
h_0
if
c_0
:
assert
c_0
.
shape
==
(
batch_size
,
size
),
\
'The shape of c0 should be (batch_size, %d)'
%
size
inputs
[
'C0'
]
=
c_0
if
cell_clip
:
assert
cell_clip
>=
0
,
"cell_clip should not be negtive."
if
proj_clip
:
assert
proj_clip
>=
0
,
"proj_clip should not be negtive."
helper
.
append_op
(
type
=
'lstmp'
,
inputs
=
inputs
,
outputs
=
{
'Projection'
:
projection
,
'Cell'
:
cell
,
'OrderedP0'
:
ordered_proj0
,
'BatchHidden'
:
batch_hidden
,
'BatchGate'
:
batch_gate
,
'BatchCellPreAct'
:
batch_cell_pre_act
},
attrs
=
{
'use_peepholes'
:
use_peepholes
,
'cell_clip'
:
cell_clip
,
'proj_clip'
:
proj_clip
,
'is_reverse'
:
is_reverse
,
'gate_activation'
:
gate_activation
,
'cell_activation'
:
cell_activation
,
...
...
@@ -2441,7 +2473,7 @@ def pool2d(input,
data = fluid.layers.data(
name='data', shape=[3, 32, 32], dtype='float32')
conv
2d = fluid.layers.pool2d(
pool
2d = fluid.layers.pool2d(
input=data,
pool_size=2,
pool_type='max',
...
...
@@ -2490,6 +2522,7 @@ def pool2d(input,
return
pool_out
@
templatedoc
()
def
pool3d
(
input
,
pool_size
=-
1
,
pool_type
=
"max"
,
...
...
@@ -2501,13 +2534,19 @@ def pool3d(input,
name
=
None
,
exclusive
=
True
):
"""
This function adds the operator for pooling in 3-dimensions, using the
pooling configurations mentioned in input parameters.
${comment}
Args:
input (Variable): ${input_comment}
pool_size (int): ${ksize_comment}
pool_type (str): ${pooling_type_comment}
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCDHW, where N is batch size, C is
the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width
of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
pool_type (string): ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size.
global_pooling (bool): ${global_pooling_comment}
...
...
@@ -2520,6 +2559,19 @@ def pool3d(input,
Returns:
Variable: output of pool3d layer.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[3, 32, 32, 32], dtype='float32')
pool3d = fluid.layers.pool3d(
input=data,
pool_size=2,
pool_type='max',
pool_stride=1,
global_pooling=False)
"""
if
pool_type
not
in
[
"max"
,
"avg"
]:
raise
ValueError
(
...
...
@@ -2569,7 +2621,27 @@ def adaptive_pool2d(input,
require_index
=
False
,
name
=
None
):
"""
${comment}
**Adaptive Pool2d Operator**
The adaptive_pool2d operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(pool_size) should contain two elements which
represent height and width, respectively. Also the H and W dimensions of output(Out)
is same as Parameter(pool_size).
For average adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) &=
\\
frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
Args:
input (Variable): The input tensor of pooling operator. The format of
...
...
@@ -2579,8 +2651,8 @@ def adaptive_pool2d(input,
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point
along with outputs.
i
t cannot be set in average pooling type.
require_index (bool): If true, the index of max pooling point
will be returned along
with outputs. I
t cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
...
...
@@ -2661,18 +2733,42 @@ def adaptive_pool3d(input,
require_index
=
False
,
name
=
None
):
"""
${comment}
**Adaptive Pool3d Operator**
The adaptive_pool3d operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
size, C is the number of channels, D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(pool_size) should contain
three elements which represent height and width, respectively. Also the D, H and W
dimensions of output(Out) is same as Parameter(pool_size).
For average adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) &=
\\
frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args:
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCHW, where N is batch size, C is
the number of channels,
H is the height of the
feature, and W is the width of the feature.
input tensor is NC
D
HW, where N is batch size, C is
the number of channels,
D is the depth of the feature,
H is the height of the
feature, and W is the width of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain t
wo
integers, (Depth, Height, Width).
it must contain t
hree
integers, (Depth, Height, Width).
pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point
along with outputs.
i
t cannot be set in average pooling type.
require_index (bool): If true, the index of max pooling point
will be returned along
with outputs. I
t cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
...
...
@@ -2709,7 +2805,7 @@ def adaptive_pool3d(input,
name='data', shape=[3, 32, 32], dtype='float32')
pool_out, mask = fluid.layers.adaptive_pool3d(
input=data,
pool_size=[3, 3],
pool_size=[3, 3
, 3
],
pool_type='avg')
"""
if
pool_type
not
in
[
"max"
,
"avg"
]:
...
...
@@ -2945,7 +3041,6 @@ def data_norm(input,
param_attr
=
None
,
data_layout
=
'NCHW'
,
in_place
=
False
,
use_mkldnn
=
False
,
name
=
None
,
moving_mean_name
=
None
,
moving_variance_name
=
None
,
...
...
@@ -2979,7 +3074,6 @@ def data_norm(input,
param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
data_layout(string, default NCHW): NCHW|NHWC
in_place(bool, Default False): Make the input and output of batch norm reuse memory.
use_mkldnn(bool, Default false): ${use_mkldnn_comment}
name(string, Default None): A name for this layer(optional). If set None, the layer
will be named automatically.
moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
...
...
@@ -3060,8 +3154,7 @@ def data_norm(input,
outputs
=
{
"Y"
:
data_norm_out
,
"Means"
:
means
,
"Scales"
:
scales
},
attrs
=
{
"epsilon"
:
epsilon
,
"use_mkldnn"
:
use_mkldnn
})
attrs
=
{
"epsilon"
:
epsilon
})
return
helper
.
append_activation
(
data_norm_out
)
...
...
@@ -5765,6 +5858,132 @@ def softmax_with_cross_entropy(logits,
return
loss
def
sampled_softmax_with_cross_entropy
(
logits
,
label
,
num_samples
,
num_true
=
1
,
remove_accidental_hits
=
True
,
use_customized_samples
=
False
,
customized_samples
=
None
,
customized_probabilities
=
None
,
seed
=
0
):
"""
**Sampled Softmax With Cross Entropy Operator.**
Cross entropy loss with sampled softmax is used as the output layer for
larger output classes extensively. This operator samples a number of samples
for all examples, and computes the softmax normalized values for each
row of the sampled tensor, after which cross-entropy loss is computed.
Because this operator performs a softmax on logits internally, it expects
unscaled logits. This operator should not be used with the output of
softmax operator since that would produce incorrect results.
For examples with T true labels (T >= 1), we assume that each true label has
a probability of 1/T. For each sample, S samples are generated using a
log uniform distribution. True labels are concatenated with these samples to
form T + S samples for each example. So, assume the shape of logits is
[N x K], the shape for samples is [N x (T+S)]. For each sampled label, a
probability is calculated, which corresponds to the Q(y|x) in
[Jean et al., 2014](http://arxiv.org/abs/1412.2007).
Logits are sampled according to the sampled labels. Then if
remove_accidental_hits is True, if a sample[i, j] accidentally hits true
labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to
make its softmax result close to zero. Then sampled logits are subtracted by
logQ(y|x), these sampled logits and re-indexed labels are used to compute
a softmax with cross entropy.
Args:
logits (Variable): The unscaled log probabilities, which is a 2-D tensor
with shape [N x K]. N is the batch_size, and K is the class number.
label (Variable): The ground truth which is a 2-D tensor. Label is a
Tensor<int64> with shape [N x T], where T is the number of true
labels per example.
num_samples (int): The number for each example, num_samples should be
less than the number of class.
num_true(int): The number of target classes per training example.
remove_accidental_hits (bool): A flag indicating whether to remove
accidental hits when sampling. If True and if a sample[i, j]
accidentally hits true labels, then the corresponding
sampled_logits[i, j] is minus by 1e20 to make its softmax result
close to zero. Default is True.
use_customized_samples (bool): Whether to use custom samples and probabities to sample
logits.
customized_samples (Variable): User defined samples, which is a 2-D tensor
with shape [N, T + S]. S is the num_samples, and T is the number of true
labels per example.
customized_probabilities (Variable): User defined probabilities of samples,
a 2-D tensor which has the same shape with customized_samples.
seed (int): The random seed for generating random number, which is used
in the process of sampling. Default is 0.
Returns:
Variable: Return the cross entropy loss which is a 2-D tensor with shape
[N x 1].
Examples:
.. code-block:: python
logits = fluid.layers.data(name='data', shape=[256], dtype='float32')
label = fluid.layers.data(name='label', shape=[5], dtype='int64')
fc = fluid.layers.fc(input=data, size=100)
out = fluid.layers.sampled_softmax_with_cross_entropy(
logits=fc, label=label, num_samples=25)
"""
helper
=
LayerHelper
(
'sample_logits'
,
**
locals
())
samples
=
helper
.
create_variable_for_type_inference
(
dtype
=
'int64'
)
probabilities
=
helper
.
create_variable_for_type_inference
(
dtype
=
logits
.
dtype
)
sampled_logits
\
=
helper
.
create_variable_for_type_inference
(
dtype
=
logits
.
dtype
)
sampled_label
=
helper
.
create_variable_for_type_inference
(
dtype
=
'int64'
)
sampled_softlabel
=
helper
.
create_variable_for_type_inference
(
dtype
=
logits
.
dtype
)
helper
.
append_op
(
type
=
'sample_logits'
,
inputs
=
{
'Logits'
:
logits
,
'Labels'
:
label
,
'CustomizedSamples'
:
customized_samples
,
'CustomizedProbabilities'
:
customized_probabilities
},
outputs
=
{
'Samples'
:
samples
,
'Probabilities'
:
probabilities
,
'SampledLabels'
:
sampled_label
,
'SampledLogits'
:
sampled_logits
},
attrs
=
{
'use_customized_samples'
:
use_customized_samples
,
'uniq'
:
True
,
'remove_accidental_hits'
:
remove_accidental_hits
,
'num_samples'
:
num_samples
,
'seed'
:
seed
})
loss
=
helper
.
create_variable_for_type_inference
(
dtype
=
logits
.
dtype
)
softmax
=
helper
.
create_variable_for_type_inference
(
dtype
=
logits
.
dtype
)
helper
.
append_op
(
type
=
'one_hot'
,
inputs
=
{
'X'
:
sampled_label
},
attrs
=
{
'depth'
:
num_samples
+
1
},
outputs
=
{
'Out'
:
sampled_softlabel
})
helper
.
append_op
(
type
=
'softmax_with_cross_entropy'
,
inputs
=
{
'Logits'
:
sampled_logits
,
'Label'
:
sampled_softlabel
},
outputs
=
{
'Softmax'
:
softmax
,
'Loss'
:
loss
},
attrs
=
{
'soft_label'
:
True
,
'ignore_index'
:
False
,
'numeric_stable_mode'
:
False
})
return
loss
/
num_true
def
smooth_l1
(
x
,
y
,
inside_weight
=
None
,
outside_weight
=
None
,
sigma
=
None
):
"""
This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
...
...
@@ -9723,6 +9942,7 @@ def teacher_student_sigmoid_loss(input,
Examples:
.. code-block:: python
cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label)
"""
helper
=
LayerHelper
(
'teacher_student_sigmoid_loss'
,
**
locals
())
...
...
python/paddle/fluid/layers/ops.py
浏览文件 @
c4187dbd
...
...
@@ -60,7 +60,28 @@ __all__ += ["uniform_random"]
_uniform_random_
=
generate_layer_fn
(
'uniform_random'
)
def
uniform_random
(
shape
,
dtype
=
None
,
min
=
None
,
max
=
None
,
seed
=
None
):
def
uniform_random
(
shape
,
dtype
=
'float32'
,
min
=-
1.0
,
max
=
1.0
,
seed
=
0
):
"""
This operator initializes a variable with random values sampled from a
uniform distribution. The random result is in set [min, max].
Args:
shape (list): The shape of output variable.
dtype(np.dtype|core.VarDesc.VarType|str): The type of data, such as
float32, float64 etc. Default: float32.
min (float): Minimum value of uniform random. Default -1.0.
max (float): Maximun value of uniform random. Default 1.0.
seed (int): Random seed used for generating samples. 0 means use a
seed generated by the system. Note that if seed is not 0, this
operator will always generate the same random numbers every time.
Default 0.
Examples:
.. code-block:: python
result = fluid.layers.uniform_random(shape=[32, 784])
"""
locals_var
=
locals
().
keys
()
if
not
isinstance
(
dtype
,
core
.
VarDesc
.
VarType
):
dtype
=
convert_np_dtype_to_dtype_
(
dtype
)
...
...
@@ -72,12 +93,6 @@ def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
return
_uniform_random_
(
**
kwargs
)
uniform_random
.
__doc__
=
_uniform_random_
.
__doc__
+
"""
Examples:
>>> result = fluid.layers.uniform_random(shape=[32, 784])
"""
__all__
+=
[
'hard_shrink'
]
_hard_shrink_
=
generate_layer_fn
(
'hard_shrink'
)
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
c4187dbd
...
...
@@ -29,15 +29,6 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
BuildStrategy
=
core
.
ParallelExecutor
.
BuildStrategy
def
_is_pserver_mode
(
main_program
):
main
=
main_program
if
main_program
\
else
framework
.
default_main_program
()
for
op
in
main
.
global_block
().
ops
:
if
op
.
type
in
[
"send"
,
"recv"
]:
return
True
return
False
class
ParallelExecutor
(
object
):
"""
ParallelExecutor is designed for data parallelism, which focuses on distributing
...
...
@@ -140,7 +131,7 @@ class ParallelExecutor(object):
# FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
# num_trainers is 1, so the current fields of build_strategy doesn't tell if
# it's distributed model.
build_strategy
.
is_distribution
=
_
is_pserver_mode
(
build_strategy
.
is_distribution
=
framework
.
is_pserver_mode
(
main_program
)
or
num_trainers
>
1
# step4: get main_program, scope, local_scopes
...
...
@@ -185,10 +176,13 @@ class ParallelExecutor(object):
places
=
list
(
map
(
place_obj
,
self
.
_places
))
# step7: init ParallelExecutor
# ParallelExecutor API will be deprecated, don't support parallel graph.
self
.
_graph
=
core
.
Graph
(
main
.
desc
)
self
.
executor
=
core
.
ParallelExecutor
(
places
,
persistable_vars
,
main
.
desc
,
places
,
persistable_vars
,
cpt
.
to_text
(
loss_name
)
if
loss_name
else
six
.
u
(
''
),
scope
,
local_scopes
,
exec_strategy
,
build_strategy
)
local_scopes
,
exec_strategy
,
build_strategy
,
self
.
_graph
)
self
.
scope
=
scope
...
...
python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
0 → 100644
浏览文件 @
c4187dbd
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
six
import
unittest
import
time
import
math
import
multiprocessing
import
numpy
as
np
import
paddle
import
paddle.fluid.core
as
core
import
paddle.fluid
as
fluid
from
paddle.fluid
import
compiler
# open eager delete mode
os
.
environ
[
'FLAGS_eager_delete_tensor_gb'
]
=
'0.0'
os
.
environ
[
'FLAGS_fast_eager_deletion_mode'
]
=
'true'
os
.
environ
[
'CPU_NUM'
]
=
'2'
class
BuildIrMemOptBase
(
unittest
.
TestCase
):
def
check_network_convergence
(
self
,
network
,
use_cuda
=
True
,
memory_opt
=
True
,
use_ir_memory_optimize
=
True
,
enable_inplace
=
True
,
iter
=
5
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
print
(
'Skip use_cuda=True because Paddle is not compiled with cuda'
)
return
if
os
.
name
==
'nt'
:
print
(
'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
)
return
fluid
.
default_startup_program
().
random_seed
=
100
fluid
.
default_main_program
().
random_seed
=
100
batch_size
=
32
batch_size
*=
fluid
.
core
.
get_cuda_device_count
()
if
use_cuda
else
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
# build network
word_dict
=
paddle
.
dataset
.
imdb
.
word_dict
()
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
imdb
.
train
(
word_dict
),
batch_size
=
batch_size
)
data
=
fluid
.
layers
.
data
(
name
=
"words"
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
label
=
fluid
.
layers
.
data
(
name
=
"label"
,
shape
=
[
1
],
dtype
=
"int64"
)
cost
=
network
(
data
,
label
,
len
(
word_dict
))
optimizer
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
optimizer
.
minimize
(
cost
)
if
memory_opt
:
fluid
.
memory_optimize
(
fluid
.
default_main_program
())
# execution
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
feeder
=
fluid
.
DataFeeder
(
feed_list
=
[
data
,
label
],
place
=
place
)
reader
=
feeder
.
decorate_reader
(
train_reader
,
multi_devices
=
True
)
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
fluid
.
default_startup_program
())
train_cp
=
compiler
.
CompiledProgram
(
fluid
.
default_main_program
())
train_cp
=
train_cp
.
with_data_parallel
(
loss_name
=
cost
.
name
)
fetch_list
=
[
cost
.
name
]
begin
=
time
.
time
()
first_loss
,
last_loss
=
None
,
None
step_id
=
0
custom_iter
=
getattr
(
self
,
"iter"
,
None
)
if
not
custom_iter
==
None
:
iter
=
custom_iter
for
data
in
reader
():
ret
=
exe
.
run
(
train_cp
,
feed
=
data
,
fetch_list
=
fetch_list
)
print
(
ret
)
step_id
+=
1
if
step_id
==
1
:
first_loss
=
ret
[
0
]
if
step_id
==
iter
:
last_loss
=
ret
[
0
]
break
end
=
time
.
time
()
print
(
"%.4f Instance per second"
%
(
(
batch_size
*
iter
)
/
(
end
-
begin
)))
print
(
first_loss
,
last_loss
)
avg_last_loss_val
=
np
.
array
(
last_loss
).
mean
()
avg_first_loss_val
=
np
.
array
(
first_loss
).
mean
()
if
math
.
isnan
(
float
(
avg_last_loss_val
))
or
math
.
isnan
(
float
(
avg_first_loss_val
)):
sys
.
exit
(
"got NaN loss, training failed."
)
return
first_loss
,
last_loss
class
TestIrMemOptBase
(
BuildIrMemOptBase
):
def
setUp
(
self
):
self
.
network
=
None
def
test_network
(
self
):
if
self
.
network
is
None
or
not
core
.
is_compiled_with_cuda
():
return
baseline_first_loss
,
baseline_last_loss
=
None
,
None
for
use_cuda
in
[
True
]:
for
use_python_mem_opt
in
[
True
,
False
]:
print
(
'network: {}, use_cuda: {}, use_python_mem_opt: {}, use_ir_mem_opt : {}'
.
format
(
self
.
network
.
__name__
,
use_cuda
,
use_python_mem_opt
,
not
use_python_mem_opt
))
with
fluid
.
program_guard
(
fluid
.
Program
(),
fluid
.
Program
()):
with
fluid
.
scope_guard
(
core
.
Scope
()):
if
use_cuda
is
True
and
use_python_mem_opt
is
True
:
baseline_first_loss
,
baseline_last_loss
=
self
.
check_network_convergence
(
self
.
network
,
use_cuda
=
use_cuda
,
memory_opt
=
use_python_mem_opt
)
else
:
cur_first_loss
,
cur_last_loss
=
self
.
check_network_convergence
(
self
.
network
,
use_cuda
=
use_cuda
,
memory_opt
=
use_python_mem_opt
)
self
.
assertAlmostEquals
(
np
.
mean
(
baseline_last_loss
),
np
.
mean
(
cur_last_loss
),
delta
=
1e-2
)
self
.
assertAlmostEquals
(
np
.
mean
(
baseline_first_loss
),
np
.
mean
(
cur_first_loss
),
delta
=
1e-2
)
python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
0 → 100644
浏览文件 @
c4187dbd
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
numpy
as
np
import
paddle.fluid.core
as
core
import
paddle.fluid
as
fluid
def
check_if_mkldnn_primitives_exist_in_bwd
(
test_case
,
op_type
,
x
,
out
,
out_grad
,
x_grad
):
def
__assert_close
(
tensor
,
np_array
,
msg
,
atol
=
1e-4
):
test_case
.
assertTrue
(
np
.
allclose
(
np
.
array
(
tensor
),
np_array
,
atol
=
atol
),
msg
)
place
=
core
.
CPUPlace
()
var_dict
=
{
'x'
:
x
,
'out'
:
out
,
'out@GRAD'
:
out_grad
,
'x@GRAD'
:
x_grad
}
var_names
=
list
(
var_dict
.
keys
())
ground_truth
=
{
name
:
var_dict
[
name
]
for
name
in
var_names
}
program
=
fluid
.
Program
()
with
fluid
.
program_guard
(
program
):
block
=
program
.
global_block
()
for
name
in
ground_truth
:
block
.
create_var
(
name
=
name
,
dtype
=
np
.
float32
,
shape
=
ground_truth
[
name
].
shape
)
op
=
block
.
append_op
(
type
=
op_type
,
inputs
=
{
'X'
:
block
.
var
(
'x'
),
},
outputs
=
{
'Out'
:
block
.
var
(
'out'
)},
attrs
=
{
'use_mkldnn'
:
True
})
# Generate backward op_desc
grad_op_desc_list
,
op_grad_to_var
=
core
.
get_grad_op_desc
(
op
.
desc
,
set
(),
[])
grad_op_desc
=
grad_op_desc_list
[
0
]
new_op_desc
=
block
.
desc
.
append_op
()
new_op_desc
.
copy_from
(
grad_op_desc
)
for
var_name
in
grad_op_desc
.
output_arg_names
():
block
.
desc
.
var
(
var_name
.
encode
(
'ascii'
))
grad_op_desc
.
infer_var_type
(
block
.
desc
)
grad_op_desc
.
infer_shape
(
block
.
desc
)
for
arg
in
grad_op_desc
.
output_arg_names
():
grad_var
=
block
.
desc
.
find_var
(
arg
.
encode
(
'ascii'
))
grad_var
.
set_dtype
(
core
.
VarDesc
.
VarType
.
FP32
)
exe
=
fluid
.
Executor
(
place
)
# Do at least 2 iterations
for
i
in
range
(
2
):
out
=
exe
.
run
(
program
,
feed
=
{
name
:
var_dict
[
name
]
for
name
in
[
'x'
,
'out@GRAD'
]},
fetch_list
=
[
'x@GRAD'
,
'out'
])
__assert_close
(
x_grad
,
out
[
0
],
'x@GRAD'
)
python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
浏览文件 @
c4187dbd
...
...
@@ -19,7 +19,7 @@ import numpy as np
import
paddle.fluid.core
as
core
from
paddle.fluid.tests.unittests.op_test
import
OpTest
from
paddle.fluid.tests.unittests.test_activation_op
import
TestRelu
,
TestTanh
,
TestSqrt
,
TestAbs
import
paddle.fluid
as
flui
d
from
mkldnn_op_test
import
check_if_mkldnn_primitives_exist_in_bw
d
class
TestMKLDNNReluDim2
(
TestRelu
):
...
...
@@ -98,62 +98,24 @@ class TestMKLDNNAbsDim4(TestAbs):
# Check if primitives already exist in backward
class
TestMKLDNNReluPrimitivesAlreadyExist
(
unittest
.
TestCase
):
def
__assert_close
(
self
,
tensor
,
np_array
,
msg
,
atol
=
1e-4
):
self
.
assertTrue
(
np
.
allclose
(
np
.
array
(
tensor
),
np_array
,
atol
=
atol
),
msg
)
def
test_check_forward_backward
(
self
):
place
=
core
.
CPUPlace
()
class
TestMKLDNNAbsPrimitivesAlreadyExist
(
unittest
.
TestCase
):
def
setUp
(
self
):
super
(
TestMKLDNNAbsPrimitivesAlreadyExist
,
self
).
setUp
()
np
.
random
.
seed
(
123
)
x
=
np
.
random
.
uniform
(
-
1
,
1
,
[
2
,
2
]).
astype
(
np
.
float32
)
out
=
np
.
abs
(
x
)
out_grad
=
np
.
random
.
random_sample
(
x
.
shape
).
astype
(
np
.
float32
)
x_grad
=
out_grad
*
np
.
sign
(
x
)
# Abs grad calculation
var_dict
=
{
'x'
:
x
,
'out'
:
out
,
'out@GRAD'
:
out_grad
,
'x@GRAD'
:
x_grad
}
var_names
=
list
(
var_dict
.
keys
())
ground_truth
=
{
name
:
var_dict
[
name
]
for
name
in
var_names
}
program
=
fluid
.
Program
()
with
fluid
.
program_guard
(
program
):
block
=
program
.
global_block
()
for
name
in
ground_truth
:
block
.
create_var
(
name
=
name
,
dtype
=
'float32'
,
shape
=
ground_truth
[
name
].
shape
)
relu_op
=
block
.
append_op
(
type
=
"abs"
,
inputs
=
{
"X"
:
block
.
var
(
'x'
),
},
outputs
=
{
"Out"
:
block
.
var
(
'out'
)},
attrs
=
{
"use_mkldnn"
:
True
})
# Generate backward op_desc
grad_op_desc_list
,
op_grad_to_var
=
core
.
get_grad_op_desc
(
relu_op
.
desc
,
set
(),
[])
grad_op_desc
=
grad_op_desc_list
[
0
]
new_op_desc
=
block
.
desc
.
append_op
()
new_op_desc
.
copy_from
(
grad_op_desc
)
for
var_name
in
grad_op_desc
.
output_arg_names
():
block
.
desc
.
var
(
var_name
.
encode
(
"ascii"
))
grad_op_desc
.
infer_var_type
(
block
.
desc
)
grad_op_desc
.
infer_shape
(
block
.
desc
)
for
arg
in
grad_op_desc
.
output_arg_names
():
grad_var
=
block
.
desc
.
find_var
(
arg
.
encode
(
"ascii"
))
grad_var
.
set_dtype
(
core
.
VarDesc
.
VarType
.
FP32
)
exe
=
fluid
.
Executor
(
place
)
# Do at least 2 iterations
for
i
in
range
(
2
):
out
=
exe
.
run
(
program
,
feed
=
{
name
:
var_dict
[
name
]
for
name
in
[
'x'
,
'out@GRAD'
]},
fetch_list
=
[
'x@GRAD'
])
self
.
__assert_close
(
x_grad
,
out
[
0
],
"x@GRAD"
)
self
.
op_type
=
'abs'
self
.
x
=
np
.
random
.
uniform
(
-
1
,
1
,
[
2
,
2
]).
astype
(
np
.
float32
)
self
.
out
=
np
.
abs
(
self
.
x
)
self
.
out_grad
=
np
.
random
.
random_sample
(
self
.
x
.
shape
).
astype
(
np
.
float32
)
self
.
x_grad
=
self
.
__abs_bwd
(
self
.
x
,
self
.
out_grad
)
# Abs grad calculation
def
__abs_bwd
(
self
,
x
,
out_grad
):
return
out_grad
*
np
.
sign
(
x
)
def
test_check
(
self
):
check_if_mkldnn_primitives_exist_in_bwd
(
self
,
self
.
op_type
,
self
.
x
,
self
.
out
,
self
.
out_grad
,
self
.
x_grad
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py
0 → 100644
浏览文件 @
c4187dbd
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
paddle.fluid.tests.unittests.op_test
import
OpTest
import
paddle.fluid.core
as
core
from
paddle.fluid.tests.unittests.test_softmax_op
import
TestSoftmaxOp
,
stable_softmax
from
mkldnn_op_test
import
check_if_mkldnn_primitives_exist_in_bwd
class
TestSoftmaxMKLDNNOp
(
TestSoftmaxOp
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
class
TestSoftmaxMKLDNNOp2
(
TestSoftmaxMKLDNNOp
):
def
get_x_shape
(
self
):
return
[
2
,
3
,
4
,
5
]
# Check if primitives already exist in backward
class
TestSoftmaxMKLDNNPrimitivesAlreadyExist
(
unittest
.
TestCase
):
def
setUp
(
self
):
super
(
TestSoftmaxMKLDNNPrimitivesAlreadyExist
,
self
).
setUp
()
np
.
random
.
seed
(
123
)
self
.
op_type
=
'softmax'
self
.
x
=
np
.
random
.
uniform
(
-
1
,
1
,
2
).
astype
(
np
.
float32
)
self
.
out
=
stable_softmax
(
self
.
x
)
self
.
out_grad
=
np
.
random
.
random_sample
(
self
.
x
.
shape
).
astype
(
np
.
float32
)
self
.
x_grad
=
self
.
__softmax_bwd
(
self
.
out
,
self
.
out_grad
)
# Softmax grad calculation
def
__softmax_bwd
(
self
,
out
,
out_grad
):
return
out
*
(
out_grad
-
np
.
dot
(
out
,
out_grad
))
def
test_check
(
self
):
check_if_mkldnn_primitives_exist_in_bwd
(
self
,
self
.
op_type
,
self
.
x
,
self
.
out
,
self
.
out_grad
,
self
.
x_grad
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py
0 → 100644
浏览文件 @
c4187dbd
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
class
TestAllocContinuousSpace
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"alloc_continuous_space"
self
.
dtype
=
np
.
float32
attrs
=
self
.
init_attr
()
self
.
copy_data
=
attrs
[
"copy_data"
]
self
.
constant
=
attrs
[
"constant"
]
self
.
set_constant
=
attrs
[
"set_constant"
]
self
.
Inputs
=
self
.
init_input
()
self
.
FusedOutput
=
self
.
init_output
(
self
.
Inputs
,
self
.
set_constant
,
self
.
constant
)
self
.
inputs
=
{
'Input'
:
self
.
Inputs
}
self
.
attrs
=
attrs
self
.
outputs
=
{
'Output'
:
self
.
Inputs
,
'FusedOutput'
:
self
.
FusedOutput
}
def
init_dtype
(
self
):
self
.
dtype
=
np
.
float32
def
init_input
(
self
):
inputs
=
[]
inputs
.
append
((
"x1"
,
np
.
random
.
random
([
20
,
3
]).
astype
(
self
.
dtype
)))
inputs
.
append
((
"x2"
,
np
.
random
.
random
([
20
]).
astype
(
self
.
dtype
)))
inputs
.
append
((
"x3"
,
np
.
random
.
random
([
1
]).
astype
(
self
.
dtype
)))
inputs
.
append
((
"x4"
,
np
.
random
.
random
([
200
,
30
]).
astype
(
self
.
dtype
)))
inputs
.
append
((
"x5"
,
np
.
random
.
random
([
30
]).
astype
(
self
.
dtype
)))
inputs
.
append
((
"x6"
,
np
.
random
.
random
([
1
]).
astype
(
self
.
dtype
)))
return
inputs
def
init_attr
(
self
):
return
{
"copy_data"
:
True
,
"set_constant"
:
False
,
"constant"
:
0.0
}
def
init_output
(
self
,
input_list
,
set_constant
,
constant
):
inputs
=
[
input
[
1
].
flatten
()
for
input
in
input_list
]
output
=
np
.
concatenate
(
inputs
)
if
set_constant
:
output
=
np
.
ones
((
len
(
output
)))
*
constant
return
output
def
test_check_output
(
self
):
self
.
check_output
()
class
TestAllocContinuousSpace2
(
TestAllocContinuousSpace
):
def
init_attr
(
self
):
return
{
"copy_data"
:
False
,
"set_constant"
:
True
,
"constant"
:
0.5
}
def
test_check_output
(
self
):
self
.
check_output
(
no_check_set
=
[
"Output"
])
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_base_layer.py
浏览文件 @
c4187dbd
...
...
@@ -20,10 +20,10 @@ from paddle.fluid.layer_helper import LayerHelper
class
L1
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
):
super
(
L1
,
self
).
__init__
()
def
__init__
(
self
,
prefix
):
super
(
L1
,
self
).
__init__
(
prefix
)
self
.
_helper
=
LayerHelper
(
'MyLayer'
,
self
.
full_name
()
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
...
...
@@ -43,20 +43,20 @@ class L1(fluid.imperative.Layer):
class
L2
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
):
super
(
L2
,
self
).
__init__
()
self
.
layer1
=
L1
()
self
.
layer2
=
L1
()
def
__init__
(
self
,
prefix
):
super
(
L2
,
self
).
__init__
(
prefix
)
self
.
layer1
=
L1
(
self
.
full_name
()
)
self
.
layer2
=
L1
(
self
.
full_name
()
)
def
forward
(
self
):
return
self
.
layer1
()
+
self
.
layer2
()
class
L3
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
):
super
(
L3
,
self
).
__init__
()
self
.
layer1
=
L2
()
self
.
layer2
=
L2
()
def
__init__
(
self
,
prefix
):
super
(
L3
,
self
).
__init__
(
prefix
)
self
.
layer1
=
L2
(
self
.
full_name
()
)
self
.
layer2
=
L2
(
self
.
full_name
()
)
def
forward
(
self
):
return
self
.
layer1
()
+
self
.
layer2
()
...
...
@@ -65,16 +65,23 @@ class L3(fluid.imperative.Layer):
class
TestBaseLayer
(
unittest
.
TestCase
):
def
test_one_level
(
self
):
with
fluid
.
imperative
.
guard
():
l
=
L1
()
l
=
L1
(
'test_one_level'
)
ret
=
l
()
self
.
assertEqual
(
l
.
w1
.
name
,
"
MyLayer
_0.w_0"
)
self
.
assertEqual
(
l
.
w2
.
name
,
"
MyLayer
_0.w_1"
)
self
.
assertEqual
(
l
.
w1
.
name
,
"
test_one_level/L1_0
_0.w_0"
)
self
.
assertEqual
(
l
.
w2
.
name
,
"
test_one_level/L1_0
_0.w_1"
)
self
.
assertTrue
(
np
.
allclose
(
ret
.
_numpy
(),
0.2
*
np
.
ones
([
2
,
2
])))
def
test_three_level
(
self
):
with
fluid
.
imperative
.
guard
():
l
=
L3
()
l
=
L3
(
'test_three_level'
)
names
=
[
p
.
name
for
p
in
l
.
parameters
()]
ret
=
l
()
self
.
assertEqual
(
names
[
0
],
"test_three_level/L3_0/L2_0/L1_0_0.w_0"
)
self
.
assertEqual
(
names
[
1
],
"test_three_level/L3_0/L2_0/L1_0_0.w_1"
)
self
.
assertEqual
(
names
[
2
],
"test_three_level/L3_0/L2_0/L1_1_0.w_0"
)
self
.
assertEqual
(
names
[
3
],
"test_three_level/L3_0/L2_0/L1_1_0.w_1"
)
self
.
assertEqual
(
names
[
4
],
"test_three_level/L3_0/L2_1/L1_0_0.w_0"
)
self
.
assertEqual
(
names
[
5
],
"test_three_level/L3_0/L2_1/L1_0_0.w_1"
)
self
.
assertTrue
(
np
.
allclose
(
ret
.
_numpy
(),
0.8
*
np
.
ones
([
2
,
2
])))
...
...
python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
浏览文件 @
c4187dbd
...
...
@@ -56,6 +56,8 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
train_reader
,
multi_devices
=
use_parallel_executor
)
exe
=
fluid
.
Executor
(
place
)
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
exe
.
run
(
fluid
.
default_startup_program
())
train_cp
=
compiler
.
CompiledProgram
(
fluid
.
default_main_program
())
...
...
python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
浏览文件 @
c4187dbd
...
...
@@ -35,7 +35,7 @@ class TestFakeQuantizeOp(OpTest):
self
.
check_output
()
class
TestFakeQuantizeOp
(
OpTest
):
class
TestFakeQuantize
RangeAbsMax
Op
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"fake_quantize_range_abs_max"
self
.
attrs
=
{
...
...
@@ -43,8 +43,10 @@ class TestFakeQuantizeOp(OpTest):
'window_size'
:
int
(
1
),
'is_test'
:
False
}
x
=
(
np
.
random
.
random
((
8
,
16
,
7
,
7
))
-
0.5
)
*
10
x
=
x
.
astype
(
"float32"
)
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
8
,
16
,
7
,
7
)).
astype
(
"float32"
)
,
'X'
:
x
,
'Iter'
:
np
.
zeros
(
1
).
astype
(
"int64"
),
'InScale'
:
np
.
zeros
(
1
).
astype
(
"float32"
)
}
...
...
@@ -62,5 +64,36 @@ class TestFakeQuantizeOp(OpTest):
self
.
check_output
()
class
TestFakeQuantizeRangeAbsMaxOp2
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"fake_quantize_range_abs_max"
self
.
attrs
=
{
'bit_length'
:
int
(
8
),
'window_size'
:
int
(
1
),
'is_test'
:
True
}
x
=
(
np
.
random
.
random
((
8
,
16
,
7
,
7
))
-
0.5
)
*
10
x
=
x
.
astype
(
"float32"
)
scale
=
np
.
max
(
np
.
abs
(
x
)).
astype
(
"float32"
)
-
1.0
out_scales
=
np
.
zeros
(
self
.
attrs
[
'window_size'
]).
astype
(
"float32"
)
out_scales
[
0
]
=
scale
self
.
inputs
=
{
'X'
:
x
,
'Iter'
:
np
.
zeros
(
1
).
astype
(
"int64"
),
'InScale'
:
scale
.
astype
(
"float32"
)
}
xs
=
np
.
clip
(
x
,
-
scale
,
scale
)
qs
=
np
.
round
(
xs
/
scale
*
((
1
<<
(
self
.
attrs
[
'bit_length'
]
-
1
))
-
1
))
self
.
outputs
=
{
'Out'
:
qs
,
'OutScale'
:
scale
.
astype
(
"float32"
),
'OutScales'
:
out_scales
,
}
def
test_check_output
(
self
):
self
.
check_output
(
no_check_set
=
set
([
'OutScale'
,
'OutScales'
]))
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_imperative.py
浏览文件 @
c4187dbd
...
...
@@ -15,7 +15,6 @@
import
contextlib
import
unittest
import
numpy
as
np
import
sys
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
...
...
@@ -24,8 +23,8 @@ from test_imperative_base import new_program_scope
class
MyLayer
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
):
super
(
MyLayer
,
self
).
__init__
()
def
__init__
(
self
,
name_scope
):
super
(
MyLayer
,
self
).
__init__
(
name_scope
)
def
forward
(
self
,
inputs
):
x
=
fluid
.
layers
.
relu
(
inputs
)
...
...
@@ -50,12 +49,14 @@ class MyPyLayer(fluid.imperative.PyLayer):
class
MLP
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
):
super
(
MLP
,
self
).
__init__
()
self
.
_fc1
=
FC
(
3
,
def
__init__
(
self
,
name_scope
):
super
(
MLP
,
self
).
__init__
(
name_scope
)
self
.
_fc1
=
FC
(
self
.
full_name
(),
3
,
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
self
.
_fc2
=
FC
(
4
,
self
.
_fc2
=
FC
(
self
.
full_name
(),
4
,
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
...
...
@@ -67,8 +68,9 @@ class MLP(fluid.imperative.Layer):
class
SimpleRNNCell
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
step_input_size
,
hidden_size
,
output_size
,
param_attr
):
super
(
SimpleRNNCell
,
self
).
__init__
()
def
__init__
(
self
,
name_scope
,
step_input_size
,
hidden_size
,
output_size
,
param_attr
):
super
(
SimpleRNNCell
,
self
).
__init__
(
name_scope
)
self
.
step_input_size
=
step_input_size
self
.
hidden_size
=
hidden_size
self
.
output_size
=
output_size
...
...
@@ -158,10 +160,11 @@ class SimpleRNNCell(fluid.imperative.Layer):
class
SimpleRNN
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
):
super
(
SimpleRNN
,
self
).
__init__
()
def
__init__
(
self
,
name_scope
):
super
(
SimpleRNN
,
self
).
__init__
(
name_scope
)
self
.
seq_len
=
4
self
.
_cell
=
SimpleRNNCell
(
self
.
full_name
(),
3
,
3
,
3
,
...
...
@@ -205,7 +208,7 @@ class TestImperative(unittest.TestCase):
with
fluid
.
imperative
.
guard
():
cl
=
core
.
Layer
()
cl
.
forward
([])
l
=
fluid
.
imperative
.
Layer
()
l
=
fluid
.
imperative
.
Layer
(
"l"
)
self
.
assertRaises
(
NotImplementedError
,
l
.
forward
,
[])
def
test_pylayer_func_id
(
self
):
...
...
@@ -281,7 +284,7 @@ class TestImperative(unittest.TestCase):
np_inp
=
np
.
array
([
1.0
,
2.0
,
-
1.0
],
dtype
=
np
.
float32
)
with
fluid
.
imperative
.
guard
():
var_inp
=
fluid
.
imperative
.
base
.
to_variable
(
np_inp
)
l
=
MyLayer
()
l
=
MyLayer
(
"my_layer"
)
x
=
l
(
var_inp
)[
0
]
self
.
assertIsNotNone
(
x
)
dy_out
=
x
.
_numpy
()
...
...
@@ -291,7 +294,7 @@ class TestImperative(unittest.TestCase):
with
new_program_scope
():
inp
=
fluid
.
layers
.
data
(
name
=
"inp"
,
shape
=
[
3
],
append_batch_size
=
False
)
l
=
MyLayer
()
l
=
MyLayer
(
"my_layer"
)
x
=
l
(
inp
)[
0
]
param_grads
=
fluid
.
backward
.
append_backward
(
x
,
parameter_list
=
[
l
.
_x_for_debug
.
name
])[
0
]
...
...
@@ -309,7 +312,7 @@ class TestImperative(unittest.TestCase):
np_inp
=
np
.
array
([[
1.0
,
2.0
],
[
3.0
,
4.0
]],
dtype
=
np
.
float32
)
with
fluid
.
imperative
.
guard
():
var_inp
=
fluid
.
imperative
.
base
.
to_variable
(
np_inp
)
mlp
=
MLP
()
mlp
=
MLP
(
"mlp"
)
out
=
mlp
(
var_inp
)
dy_out
=
out
.
_numpy
()
out
.
_backward
()
...
...
@@ -318,7 +321,7 @@ class TestImperative(unittest.TestCase):
with
new_program_scope
():
inp
=
fluid
.
layers
.
data
(
name
=
"inp"
,
shape
=
[
2
,
2
],
append_batch_size
=
False
)
mlp
=
MLP
()
mlp
=
MLP
(
"mlp"
)
out
=
mlp
(
inp
)
param_grads
=
fluid
.
backward
.
append_backward
(
out
,
parameter_list
=
[
mlp
.
_fc1
.
_w
.
name
])[
0
]
...
...
@@ -334,10 +337,10 @@ class TestImperative(unittest.TestCase):
self
.
assertTrue
(
np
.
allclose
(
dy_grad
,
static_grad
))
params
=
mlp
.
parameters
(
True
)
self
.
assertEqual
(
"
FC
_0.w_0"
,
params
[
0
].
name
)
self
.
assertEqual
(
"
FC
_0.b_0"
,
params
[
1
].
name
)
self
.
assertEqual
(
"
FC_1
.w_0"
,
params
[
2
].
name
)
self
.
assertEqual
(
"
FC_1
.b_0"
,
params
[
3
].
name
)
self
.
assertEqual
(
"
mlp/MLP_0/FC_0
_0.w_0"
,
params
[
0
].
name
)
self
.
assertEqual
(
"
mlp/MLP_0/FC_0
_0.b_0"
,
params
[
1
].
name
)
self
.
assertEqual
(
"
mlp/MLP_0/FC_1_0
.w_0"
,
params
[
2
].
name
)
self
.
assertEqual
(
"
mlp/MLP_0/FC_1_0
.b_0"
,
params
[
3
].
name
)
self
.
assertEqual
(
len
(
params
),
4
)
sublayers
=
mlp
.
sublayers
(
True
)
...
...
@@ -353,7 +356,7 @@ class TestImperative(unittest.TestCase):
with
fluid
.
imperative
.
guard
():
var_inp
=
fluid
.
imperative
.
base
.
to_variable
(
np_inp
)
var_inp
=
fluid
.
layers
.
reshape
(
var_inp
,
shape
=
[
1
,
4
,
3
])
simple_rnn
=
SimpleRNN
()
simple_rnn
=
SimpleRNN
(
"simple_rnn"
)
outs
,
pre_hiddens
=
simple_rnn
.
forward
(
var_inp
)
dy_out
=
outs
[
3
].
_numpy
()
outs
[
3
].
_backward
()
...
...
@@ -364,7 +367,7 @@ class TestImperative(unittest.TestCase):
with
new_program_scope
():
inp
=
fluid
.
layers
.
data
(
name
=
"inp"
,
shape
=
[
1
,
4
,
3
],
append_batch_size
=
False
)
simple_rnn
=
SimpleRNN
()
simple_rnn
=
SimpleRNN
(
"simple_rnn"
)
outs
,
pre_hiddens
=
simple_rnn
(
inp
)
param_grads
=
fluid
.
backward
.
append_backward
(
outs
[
3
])
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
...
...
python/paddle/fluid/tests/unittests/test_imperative_gan.py
浏览文件 @
c4187dbd
...
...
@@ -28,10 +28,10 @@ from paddle.fluid.imperative.base import to_variable
class
Discriminator
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
):
super
(
Discriminator
,
self
).
__init__
()
self
.
_fc1
=
FC
(
s
ize
=
32
,
act
=
'elu'
,
name
=
"d_fc1"
)
self
.
_fc2
=
FC
(
s
ize
=
1
,
name
=
"d_fc2"
)
def
__init__
(
self
,
name_scope
):
super
(
Discriminator
,
self
).
__init__
(
name_scope
)
self
.
_fc1
=
FC
(
s
elf
.
full_name
(),
size
=
32
,
act
=
'elu'
)
self
.
_fc2
=
FC
(
s
elf
.
full_name
(),
size
=
1
)
def
forward
(
self
,
inputs
):
x
=
self
.
_fc1
(
inputs
)
...
...
@@ -39,11 +39,11 @@ class Discriminator(fluid.imperative.Layer):
class
Generator
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
):
super
(
Generator
,
self
).
__init__
()
self
.
_fc1
=
FC
(
s
ize
=
64
,
act
=
'elu'
,
name
=
"g_fc1"
)
self
.
_fc2
=
FC
(
s
ize
=
64
,
act
=
'elu'
,
name
=
"g_fc2"
)
self
.
_fc3
=
FC
(
s
ize
=
1
,
name
=
"g_fc3"
)
def
__init__
(
self
,
name_scope
):
super
(
Generator
,
self
).
__init__
(
name_scope
)
self
.
_fc1
=
FC
(
s
elf
.
full_name
(),
size
=
64
,
act
=
'elu'
)
self
.
_fc2
=
FC
(
s
elf
.
full_name
(),
size
=
64
,
act
=
'elu'
)
self
.
_fc3
=
FC
(
s
elf
.
full_name
(),
size
=
1
)
def
forward
(
self
,
inputs
):
x
=
self
.
_fc1
(
inputs
)
...
...
@@ -65,8 +65,8 @@ class TestImperativeMnist(unittest.TestCase):
scope
=
fluid
.
core
.
Scope
()
with
new_program_scope
(
main
=
discriminate_p
,
startup
=
startup
,
scope
=
scope
):
discriminator
=
Discriminator
()
generator
=
Generator
()
discriminator
=
Discriminator
(
"d"
)
generator
=
Generator
(
"g"
)
img
=
fluid
.
layers
.
data
(
name
=
"img"
,
shape
=
[
2
,
1
],
append_batch_size
=
False
)
...
...
@@ -93,8 +93,8 @@ class TestImperativeMnist(unittest.TestCase):
sgd
.
minimize
(
d_loss
)
with
new_program_scope
(
main
=
generate_p
,
startup
=
startup
,
scope
=
scope
):
discriminator
=
Discriminator
()
generator
=
Generator
()
discriminator
=
Discriminator
(
"d"
)
generator
=
Generator
(
"g"
)
noise
=
fluid
.
layers
.
data
(
name
=
"noise"
,
shape
=
[
2
,
2
],
append_batch_size
=
False
)
...
...
@@ -134,8 +134,8 @@ class TestImperativeMnist(unittest.TestCase):
fluid
.
default_startup_program
().
random_seed
=
seed
fluid
.
default_main_program
().
random_seed
=
seed
discriminator
=
Discriminator
()
generator
=
Generator
()
discriminator
=
Discriminator
(
"d"
)
generator
=
Generator
(
"g"
)
sgd
=
SGDOptimizer
(
learning_rate
=
1e-3
)
d_real
=
discriminator
(
to_variable
(
np
.
ones
([
2
,
1
],
np
.
float32
)))
...
...
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
浏览文件 @
c4187dbd
...
...
@@ -28,6 +28,7 @@ from test_imperative_base import new_program_scope
class
SimpleImgConvPool
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
name_scope
,
num_channels
,
num_filters
,
filter_size
,
...
...
@@ -44,9 +45,10 @@ class SimpleImgConvPool(fluid.imperative.Layer):
use_cudnn
=
False
,
param_attr
=
None
,
bias_attr
=
None
):
super
(
SimpleImgConvPool
,
self
).
__init__
()
super
(
SimpleImgConvPool
,
self
).
__init__
(
name_scope
)
self
.
_conv2d
=
Conv2D
(
self
.
full_name
(),
num_channels
=
num_channels
,
num_filters
=
num_filters
,
filter_size
=
filter_size
,
...
...
@@ -59,6 +61,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):
use_cudnn
=
use_cudnn
)
self
.
_pool2d
=
Pool2D
(
self
.
full_name
(),
pool_size
=
pool_size
,
pool_type
=
pool_type
,
pool_stride
=
pool_stride
,
...
...
@@ -73,19 +76,20 @@ class SimpleImgConvPool(fluid.imperative.Layer):
class
MNIST
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
param_attr
=
None
,
bias_attr
=
None
):
super
(
MNIST
,
self
).
__init__
()
def
__init__
(
self
,
name_scope
,
param_attr
=
None
,
bias_attr
=
None
):
super
(
MNIST
,
self
).
__init__
(
name_scope
)
self
.
_simple_img_conv_pool_1
=
SimpleImgConvPool
(
1
,
20
,
5
,
2
,
2
,
act
=
"relu"
)
self
.
full_name
(),
1
,
20
,
5
,
2
,
2
,
act
=
"relu"
)
self
.
_simple_img_conv_pool_2
=
SimpleImgConvPool
(
20
,
50
,
5
,
2
,
2
,
act
=
"relu"
)
self
.
full_name
(),
20
,
50
,
5
,
2
,
2
,
act
=
"relu"
)
pool_2_shape
=
50
*
4
*
4
SIZE
=
10
scale
=
(
2.0
/
(
pool_2_shape
**
2
*
SIZE
))
**
0.5
self
.
_fc
=
FC
(
10
,
self
.
_fc
=
FC
(
self
.
full_name
(),
10
,
param_attr
=
fluid
.
param_attr
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
NormalInitializer
(
loc
=
0.0
,
scale
=
scale
)),
...
...
@@ -101,25 +105,24 @@ class MNIST(fluid.imperative.Layer):
class
TestImperativeMnist
(
unittest
.
TestCase
):
def
test_mnist_float32
(
self
):
seed
=
90
batch_num
=
2
epoch_num
=
1
with
fluid
.
imperative
.
guard
():
fluid
.
default_startup_program
().
random_seed
=
seed
fluid
.
default_main_program
().
random_seed
=
seed
mnist
=
MNIST
()
mnist
=
MNIST
(
"mnist"
)
sgd
=
SGDOptimizer
(
learning_rate
=
1e-3
)
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
train
(),
batch_size
=
128
)
paddle
.
dataset
.
mnist
.
train
(),
batch_size
=
128
,
drop_last
=
True
)
dy_param_init_value
=
{}
for
epoch
in
range
(
epoch_num
):
for
batch_id
,
data
in
enumerate
(
train_reader
()):
if
batch_id
>=
batch_num
:
break
dy_x_data
=
np
.
array
(
[
x
[
0
].
reshape
(
1
,
28
,
28
)
for
x
in
data
]).
astype
(
'float32'
)
y_data
=
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
).
reshape
(
128
,
1
)
[
x
[
0
].
reshape
(
1
,
28
,
28
)
for
x
in
data
]).
astype
(
'float32'
)
y_data
=
np
.
array
(
[
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
).
reshape
(
128
,
1
)
img
=
to_variable
(
dy_x_data
)
label
=
to_variable
(
y_data
)
...
...
@@ -128,19 +131,21 @@ class TestImperativeMnist(unittest.TestCase):
cost
=
mnist
(
img
)
loss
=
fluid
.
layers
.
cross_entropy
(
cost
,
label
)
avg_loss
=
fluid
.
layers
.
mean
(
loss
)
dy_out
=
avg_loss
.
_numpy
()
if
batch_id
==
0
:
for
param
in
fluid
.
default_main_program
().
global_block
(
).
all_parameters
():
if
epoch
==
0
and
batch_id
==
0
:
for
param
in
mnist
.
parameters
():
dy_param_init_value
[
param
.
name
]
=
param
.
_numpy
()
avg_loss
.
_backward
()
sgd
.
minimize
(
avg_loss
)
mnist
.
clear_gradients
()
fluid
.
default_main_program
().
global_block
().
_clear_block
()
dy_param_value
=
{}
for
param
in
fluid
.
default_main_program
().
global_block
(
).
all_parameters
():
for
param
in
mnist
.
parameters
():
dy_param_value
[
param
.
name
]
=
param
.
_numpy
()
with
new_program_scope
():
...
...
@@ -150,10 +155,10 @@ class TestImperativeMnist(unittest.TestCase):
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
(
)
if
not
core
.
is_compiled_with_cuda
()
else
fluid
.
CUDAPlace
(
0
))
mnist
=
MNIST
()
mnist
=
MNIST
(
"mnist"
)
sgd
=
SGDOptimizer
(
learning_rate
=
1e-3
)
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
train
(),
batch_size
=
128
)
paddle
.
dataset
.
mnist
.
train
(),
batch_size
=
128
,
drop_last
=
True
)
img
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
[
1
,
28
,
28
],
dtype
=
'float32'
)
...
...
@@ -166,8 +171,7 @@ class TestImperativeMnist(unittest.TestCase):
# initialize params and fetch them
static_param_init_value
=
{}
static_param_name_list
=
[]
for
param
in
fluid
.
default_startup_program
().
global_block
(
).
all_parameters
():
for
param
in
mnist
.
parameters
():
static_param_name_list
.
append
(
param
.
name
)
out
=
exe
.
run
(
fluid
.
default_startup_program
(),
...
...
@@ -176,18 +180,18 @@ class TestImperativeMnist(unittest.TestCase):
for
i
in
range
(
len
(
static_param_name_list
)):
static_param_init_value
[
static_param_name_list
[
i
]]
=
out
[
i
]
for
epoch
in
range
(
epoch_num
):
for
batch_id
,
data
in
enumerate
(
train_reader
()):
if
batch_id
>=
batch_num
:
break
static_x_data
=
np
.
array
(
[
x
[
0
].
reshape
(
1
,
28
,
28
)
for
x
in
data
]).
astype
(
'float32'
)
y_data
=
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
).
reshape
(
[
128
,
1
])
[
x
[
0
].
reshape
(
1
,
28
,
28
)
for
x
in
data
]).
astype
(
'float32'
)
y_data
=
np
.
array
(
[
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
).
reshape
([
128
,
1
])
fetch_list
=
[
avg_loss
.
name
]
fetch_list
.
extend
(
static_param_name_list
)
out
=
exe
.
run
(
fluid
.
default_main_program
(),
out
=
exe
.
run
(
fluid
.
default_main_program
(),
feed
=
{
"pixel"
:
static_x_data
,
"label"
:
y_data
},
fetch_list
=
fetch_list
)
...
...
@@ -195,7 +199,10 @@ class TestImperativeMnist(unittest.TestCase):
static_param_value
=
{}
static_out
=
out
[
0
]
for
i
in
range
(
1
,
len
(
out
)):
static_param_value
[
static_param_name_list
[
i
-
1
]]
=
out
[
i
]
static_param_value
[
static_param_name_list
[
i
-
1
]]
=
out
[
i
]
self
.
assertTrue
(
np
.
allclose
(
dy_x_data
.
all
(),
static_x_data
.
all
()))
for
key
,
value
in
six
.
iteritems
(
static_param_init_value
):
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_param_init_value
[
key
]))
...
...
@@ -203,7 +210,7 @@ class TestImperativeMnist(unittest.TestCase):
self
.
assertTrue
(
np
.
allclose
(
static_out
,
dy_out
))
for
key
,
value
in
six
.
iteritems
(
static_param_value
):
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_param_value
[
key
]))
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_param_value
[
key
]
,
atol
=
1e-5
))
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
浏览文件 @
c4187dbd
...
...
@@ -28,12 +28,13 @@ from paddle.fluid.backward import append_backward
class
SimpleLSTMRNN
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
name_scope
,
hidden_size
,
num_steps
,
num_layers
=
2
,
init_scale
=
0.1
,
dropout
=
None
):
super
(
SimpleLSTMRNN
,
self
).
__init__
()
super
(
SimpleLSTMRNN
,
self
).
__init__
(
name_scope
)
self
.
_hidden_size
=
hidden_size
self
.
_num_layers
=
num_layers
self
.
_init_scale
=
init_scale
...
...
@@ -130,13 +131,14 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
class
PtbModel
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
name_scope
,
hidden_size
,
vocab_size
,
num_layers
=
2
,
num_steps
=
20
,
init_scale
=
0.1
,
dropout
=
None
):
super
(
PtbModel
,
self
).
__init__
()
super
(
PtbModel
,
self
).
__init__
(
name_scope
)
self
.
hidden_size
=
hidden_size
self
.
vocab_size
=
vocab_size
self
.
init_scale
=
init_scale
...
...
@@ -146,12 +148,14 @@ class PtbModel(fluid.imperative.Layer):
from
paddle.fluid.layer_helper
import
LayerHelper
self
.
_helper
=
LayerHelper
(
'PtbModel'
,
act
=
"tanh"
)
self
.
simple_lstm_rnn
=
SimpleLSTMRNN
(
self
.
full_name
(),
hidden_size
,
num_steps
,
num_layers
=
num_layers
,
init_scale
=
init_scale
,
dropout
=
dropout
)
self
.
embedding
=
Embedding
(
self
.
full_name
(),
size
=
[
vocab_size
,
hidden_size
],
dtype
=
'float32'
,
is_sparse
=
False
,
...
...
@@ -226,6 +230,7 @@ class TestImperativePtbRnn(unittest.TestCase):
fluid
.
default_main_program
().
random_seed
=
seed
# TODO: marsyang1993 Change seed to
ptb_model
=
PtbModel
(
"ptb_model"
,
hidden_size
=
hidden_size
,
vocab_size
=
vocab_size
,
num_layers
=
num_layers
,
...
...
@@ -265,6 +270,7 @@ class TestImperativePtbRnn(unittest.TestCase):
fluid
.
default_startup_program
().
random_seed
=
seed
fluid
.
default_main_program
().
random_seed
=
seed
ptb_model
=
PtbModel
(
"ptb_model"
,
hidden_size
=
hidden_size
,
vocab_size
=
vocab_size
,
num_layers
=
num_layers
,
...
...
python/paddle/fluid/tests/unittests/test_imperative_resnet.py
浏览文件 @
c4187dbd
...
...
@@ -70,15 +70,17 @@ def optimizer_setting(params):
class
ConvBNLayer
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
name_scope
,
num_channels
,
num_filters
,
filter_size
,
stride
=
1
,
groups
=
1
,
act
=
None
):
super
(
ConvBNLayer
,
self
).
__init__
()
super
(
ConvBNLayer
,
self
).
__init__
(
name_scope
)
self
.
_conv
=
Conv2D
(
self
.
full_name
(),
num_channels
=
num_channels
,
num_filters
=
num_filters
,
filter_size
=
filter_size
,
...
...
@@ -88,7 +90,7 @@ class ConvBNLayer(fluid.imperative.Layer):
act
=
None
,
bias_attr
=
None
)
self
.
_batch_norm
=
BatchNorm
(
num_filters
,
act
=
act
)
self
.
_batch_norm
=
BatchNorm
(
self
.
full_name
(),
num_filters
,
act
=
act
)
def
forward
(
self
,
inputs
):
y
=
self
.
_conv
(
inputs
)
...
...
@@ -98,21 +100,29 @@ class ConvBNLayer(fluid.imperative.Layer):
class
BottleneckBlock
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
num_channels
,
num_filters
,
stride
,
shortcut
=
True
):
super
(
BottleneckBlock
,
self
).
__init__
()
def
__init__
(
self
,
name_scope
,
num_channels
,
num_filters
,
stride
,
shortcut
=
True
):
super
(
BottleneckBlock
,
self
).
__init__
(
name_scope
)
self
.
conv0
=
ConvBNLayer
(
self
.
full_name
(),
num_channels
=
num_channels
,
num_filters
=
num_filters
,
filter_size
=
1
,
act
=
'relu'
)
self
.
conv1
=
ConvBNLayer
(
self
.
full_name
(),
num_channels
=
num_filters
,
num_filters
=
num_filters
,
filter_size
=
3
,
stride
=
stride
,
act
=
'relu'
)
self
.
conv2
=
ConvBNLayer
(
self
.
full_name
(),
num_channels
=
num_filters
,
num_filters
=
num_filters
*
4
,
filter_size
=
1
,
...
...
@@ -120,6 +130,7 @@ class BottleneckBlock(fluid.imperative.Layer):
if
not
shortcut
:
self
.
short
=
ConvBNLayer
(
self
.
full_name
(),
num_channels
=
num_channels
,
num_filters
=
num_filters
*
4
,
filter_size
=
1
,
...
...
@@ -141,13 +152,13 @@ class BottleneckBlock(fluid.imperative.Layer):
y
=
fluid
.
layers
.
elementwise_add
(
x
=
short
,
y
=
conv2
)
layer_helper
=
LayerHelper
(
'elementwise_add_activation'
,
act
=
'relu'
)
layer_helper
=
LayerHelper
(
self
.
full_name
()
,
act
=
'relu'
)
return
layer_helper
.
append_activation
(
y
)
class
ResNet
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
layers
=
50
,
class_dim
=
102
):
super
(
ResNet
,
self
).
__init__
()
def
__init__
(
self
,
name_scope
,
layers
=
50
,
class_dim
=
102
):
super
(
ResNet
,
self
).
__init__
(
name_scope
)
self
.
layers
=
layers
supported_layers
=
[
50
,
101
,
152
]
...
...
@@ -163,9 +174,18 @@ class ResNet(fluid.imperative.Layer):
num_filters
=
[
64
,
128
,
256
,
512
]
self
.
conv
=
ConvBNLayer
(
num_channels
=
3
,
num_filters
=
64
,
filter_size
=
7
,
stride
=
2
,
act
=
'relu'
)
self
.
full_name
(),
num_channels
=
3
,
num_filters
=
64
,
filter_size
=
7
,
stride
=
2
,
act
=
'relu'
)
self
.
pool2d_max
=
Pool2D
(
pool_size
=
3
,
pool_stride
=
2
,
pool_padding
=
1
,
pool_type
=
'max'
)
self
.
full_name
(),
pool_size
=
3
,
pool_stride
=
2
,
pool_padding
=
1
,
pool_type
=
'max'
)
self
.
bottleneck_block_list
=
[]
num_channels
=
64
...
...
@@ -175,6 +195,7 @@ class ResNet(fluid.imperative.Layer):
bottleneck_block
=
self
.
add_sublayer
(
'bb_%d_%d'
%
(
block
,
i
),
BottleneckBlock
(
self
.
full_name
(),
num_channels
=
num_channels
,
num_filters
=
num_filters
[
block
],
stride
=
2
if
i
==
0
and
block
!=
0
else
1
,
...
...
@@ -184,12 +205,13 @@ class ResNet(fluid.imperative.Layer):
shortcut
=
True
self
.
pool2d_avg
=
Pool2D
(
pool_size
=
7
,
pool_type
=
'avg'
,
global_pooling
=
True
)
self
.
full_name
(),
pool_size
=
7
,
pool_type
=
'avg'
,
global_pooling
=
True
)
import
math
stdv
=
1.0
/
math
.
sqrt
(
2048
*
1.0
)
self
.
out
=
FC
(
size
=
class_dim
,
self
.
out
=
FC
(
self
.
full_name
(),
size
=
class_dim
,
act
=
'softmax'
,
param_attr
=
fluid
.
param_attr
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Uniform
(
-
stdv
,
stdv
)))
...
...
@@ -209,12 +231,12 @@ class TestImperativeResnet(unittest.TestCase):
seed
=
90
batch_size
=
train_parameters
[
"batch_size"
]
batch_num
=
1
batch_num
=
2
with
fluid
.
imperative
.
guard
():
fluid
.
default_startup_program
().
random_seed
=
seed
fluid
.
default_main_program
().
random_seed
=
seed
resnet
=
ResNet
()
resnet
=
ResNet
(
"resnet"
)
optimizer
=
optimizer_setting
(
train_parameters
)
np
.
random
.
seed
(
seed
)
import
random
...
...
@@ -264,6 +286,8 @@ class TestImperativeResnet(unittest.TestCase):
optimizer
.
minimize
(
avg_loss
)
resnet
.
clear_gradients
()
fluid
.
default_main_program
().
global_block
().
_clear_block
()
dy_param_value
=
{}
for
param
in
resnet
.
parameters
():
dy_param_value
[
param
.
name
]
=
param
.
_numpy
()
...
...
@@ -275,7 +299,7 @@ class TestImperativeResnet(unittest.TestCase):
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
(
)
if
not
core
.
is_compiled_with_cuda
()
else
fluid
.
CUDAPlace
(
0
))
resnet
=
ResNet
()
resnet
=
ResNet
(
"resnet"
)
optimizer
=
optimizer_setting
(
train_parameters
)
np
.
random
.
seed
(
seed
)
...
...
@@ -297,11 +321,9 @@ class TestImperativeResnet(unittest.TestCase):
static_param_init_value
=
{}
static_param_name_list
=
[]
static_grad_name_list
=
[]
for
param
in
fluid
.
default_startup_program
().
global_block
(
).
all_parameters
():
for
param
in
resnet
.
parameters
():
static_param_name_list
.
append
(
param
.
name
)
for
param
in
fluid
.
default_main_program
().
global_block
(
).
all_parameters
():
for
param
in
resnet
.
parameters
():
if
not
param
.
stop_gradient
:
static_grad_name_list
.
append
(
param
.
name
+
core
.
grad_var_suffix
())
...
...
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
0 → 100644
浏览文件 @
c4187dbd
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# nlp model stack of op operate on lod. It's a classical test case in optimize pass.
from
__future__
import
print_function
import
paddle.fluid
as
fluid
import
unittest
from
ir_memory_optimize_net_base
import
TestIrMemOptBase
def
lstm_net
(
data
,
label
,
dict_dim
,
emb_dim
=
128
,
hid_dim
=
128
,
hid_dim2
=
96
,
class_dim
=
2
,
emb_lr
=
30.0
):
emb
=
fluid
.
layers
.
embedding
(
input
=
data
,
size
=
[
dict_dim
,
emb_dim
],
param_attr
=
fluid
.
ParamAttr
(
learning_rate
=
emb_lr
))
fc0
=
fluid
.
layers
.
fc
(
input
=
emb
,
size
=
hid_dim
*
4
)
lstm_h
,
c
=
fluid
.
layers
.
dynamic_lstm
(
input
=
fc0
,
size
=
hid_dim
*
4
,
is_reverse
=
False
)
lstm_max
=
fluid
.
layers
.
sequence_pool
(
input
=
lstm_h
,
pool_type
=
'max'
)
lstm_max_tanh
=
fluid
.
layers
.
tanh
(
lstm_max
)
fc1
=
fluid
.
layers
.
fc
(
input
=
lstm_max_tanh
,
size
=
hid_dim2
,
act
=
'tanh'
)
prediction
=
fluid
.
layers
.
fc
(
input
=
fc1
,
size
=
class_dim
,
act
=
'softmax'
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
return
avg_cost
class
TestIrMemOptRNN
(
TestIrMemOptBase
):
def
setUp
(
self
):
self
.
network
=
lstm_net
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
浏览文件 @
c4187dbd
...
...
@@ -13,21 +13,44 @@
# limitations under the License.
import
os
import
sys
import
unittest
from
timeit
import
default_timer
as
timer
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
import
paddle.dataset.wmt16
as
wmt16
os
.
environ
[
'FLAGS_eager_delete_tensor_gb'
]
=
"0.0"
os
.
environ
[
'RECORDIO_FILENAME'
]
=
'/tmp/ir_memory_optimize_transformer.wmt16.recordio'
from
test_parallel_executor_transformer
import
TestTransformer
from
test_parallel_executor_transformer
import
transformer
from
test_parallel_executor_transformer
import
transformer
,
ModelHyperParams
,
transformer_model
,
transformer
,
prepare_batch_input
from
parallel_executor_test_base
import
TestParallelExecutorBase
# NOTE(dzhwinter): test diferent strategy colisions.
# open the eager delete tensor strategy by default.
class
TestTransformerWithIR
(
TestTransformer
):
class
TestTransformerWithIR
(
TestParallelExecutorBase
):
@
classmethod
def
setUpClass
(
cls
):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
reader
=
paddle
.
batch
(
wmt16
.
train
(
ModelHyperParams
.
src_vocab_size
,
ModelHyperParams
.
trg_vocab_size
),
batch_size
=
transformer_model
.
batch_size
)
with
fluid
.
recordio_writer
.
create_recordio_writer
(
os
.
environ
.
get
(
"RECORDIO_FILENAME"
))
as
writer
:
for
batch
in
reader
():
for
tensor
in
prepare_batch_input
(
batch
,
ModelHyperParams
.
src_pad_idx
,
ModelHyperParams
.
trg_pad_idx
,
ModelHyperParams
.
n_head
):
t
=
fluid
.
LoDTensor
()
t
.
set
(
tensor
,
fluid
.
CPUPlace
())
writer
.
append_tensor
(
t
)
writer
.
complete_append_tensor
()
def
test_main
(
self
):
if
core
.
is_compiled_with_cuda
():
# check python transpiler
...
...
@@ -35,13 +58,15 @@ class TestTransformerWithIR(TestTransformer):
transformer
,
use_cuda
=
True
,
memory_opt
=
True
,
use_ir_memory_optimize
=
False
)
use_ir_memory_optimize
=
False
,
iter
=
2
)
# check IR memory optimize
self
.
check_network_convergence
(
transformer
,
use_cuda
=
True
,
memory_opt
=
False
,
use_ir_memory_optimize
=
True
)
use_ir_memory_optimize
=
True
,
iter
=
2
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
c4187dbd
...
...
@@ -374,6 +374,17 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
output
)
print
(
str
(
program
))
def
test_sampled_softmax_with_cross_entropy
(
self
):
program
=
Program
()
with
program_guard
(
program
):
logits
=
layers
.
data
(
name
=
'Logits'
,
shape
=
[
256
],
dtype
=
'float64'
)
label
=
layers
.
data
(
name
=
'Label'
,
shape
=
[
1
],
dtype
=
'int64'
)
num_samples
=
25
output
=
layers
.
sampled_softmax_with_cross_entropy
(
logits
,
label
,
num_samples
)
self
.
assertIsNotNone
(
output
)
print
(
str
(
program
))
@
decorators
.
prog_scope
()
def
test_nce
(
self
):
window_size
=
5
...
...
python/paddle/fluid/tests/unittests/test_lstmp_op.py
浏览文件 @
c4187dbd
...
...
@@ -36,12 +36,14 @@ def lstmp(
w_b
=
None
,
# 1 x 4D
w_c
=
None
,
# 1 x 3D
is_reverse
=
False
,
proj_clip
=
0.0
,
cell_clip
=
0.0
,
act_gate
=
None
,
act_cell
=
None
,
act_cand
=
None
,
act_proj
=
None
):
def
_step
(
x
,
w_r
,
w_rh
,
w_c
,
r_pre
,
c_pre
,
act_gate
,
act_cell
,
act_cand
,
act_proj
):
def
_step
(
x
,
w_r
,
w_rh
,
w_c
,
r_pre
,
c_pre
,
proj_clip
,
cell_clip
,
act_gate
,
act_
cell
,
act_cand
,
act_
proj
):
g
=
np
.
dot
(
r_pre
,
w_r
)
# 1 x 4D
g
=
g
+
x
g
=
np
.
reshape
(
g
,
(
1
,
g
.
size
))
...
...
@@ -55,6 +57,17 @@ def lstmp(
g_f
=
act_gate
(
g_f
+
w_fc
*
c_pre
)
# 1 x D
c
=
g_f
*
c_pre
+
g_i
*
act_cand
(
c
)
# 1 x D
def
array_clip
(
a
,
clip
):
size
=
np
.
prod
(
a
.
shape
)
new_a
=
np
.
reshape
(
a
,
(
size
))
for
i
in
range
(
size
):
new_a
[
i
]
=
max
(
new_a
[
i
],
-
1.0
*
clip
)
new_a
[
i
]
=
min
(
new_a
[
i
],
clip
)
new_a
=
np
.
reshape
(
new_a
,
a
.
shape
)
return
new_a
if
cell_clip
>
0.0
:
c
=
array_clip
(
c
,
cell_clip
)
if
w_c
is
None
:
g_o
=
act_gate
(
g_o
)
# 1 x D
else
:
...
...
@@ -64,6 +77,8 @@ def lstmp(
# projection
r
=
np
.
dot
(
h
,
w_rh
)
r
=
act_proj
(
r
)
if
proj_clip
>
0.0
:
r
=
array_clip
(
r
,
proj_clip
)
return
r
,
c
def
_reverse
(
x
,
offset
):
...
...
@@ -87,13 +102,13 @@ def lstmp(
# compute one sequence
seq_len
=
lod
[
0
][
i
]
x
=
input
[
offset
[
i
]:
offset
[
i
+
1
],
:]
r_pre
=
np
.
dot
(
h0
[
i
],
w_rh
)
# 1 x P
r_pre
=
act_proj
(
r_pre
)
r_pre
=
h0
[
i
]
c_pre
=
c0
[
i
]
# 1 x D
for
j
in
range
(
seq_len
):
# compute one step
r_pre
,
c_pre
=
_step
(
x
[
j
],
w_r
,
w_rh
,
w_c
,
r_pre
,
c_pre
,
act_gate
,
act_cell
,
act_cand
,
act_proj
)
r_pre
,
c_pre
=
_step
(
x
[
j
],
w_r
,
w_rh
,
w_c
,
r_pre
,
c_pre
,
proj_clip
,
cell_clip
,
act_gate
,
act_cell
,
act_cand
,
act_proj
)
projection
.
append
(
r_pre
.
flatten
())
cell
.
append
(
c_pre
.
flatten
())
...
...
@@ -123,13 +138,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
T
=
sum
(
self
.
lod
[
0
])
N
=
len
(
self
.
lod
[
0
])
x
=
np
.
random
.
normal
(
size
=
(
T
,
4
*
self
.
D
)).
astype
(
'float64'
)
if
self
.
has_initial_state
:
h0
=
np
.
random
.
normal
(
size
=
(
N
,
self
.
D
)).
astype
(
'float64'
)
h0
=
np
.
random
.
normal
(
size
=
(
N
,
self
.
P
)).
astype
(
'float64'
)
c0
=
np
.
random
.
normal
(
size
=
(
N
,
self
.
D
)).
astype
(
'float64'
)
else
:
h0
=
np
.
zeros
((
N
,
self
.
D
)).
astype
(
'float64'
)
h0
=
np
.
zeros
((
N
,
self
.
P
)).
astype
(
'float64'
)
c0
=
np
.
zeros
((
N
,
self
.
D
)).
astype
(
'float64'
)
w
=
np
.
random
.
normal
(
size
=
(
self
.
P
,
4
*
self
.
D
)).
astype
(
'float64'
)
if
self
.
use_peepholes
:
...
...
@@ -140,9 +154,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
w_b
=
b
[:,
0
:
4
*
self
.
D
]
w_c
=
b
[:,
4
*
self
.
D
:]
if
self
.
use_peepholes
else
None
w_rh
=
np
.
random
.
normal
(
size
=
(
self
.
D
,
self
.
P
)).
astype
(
'float64'
)
proj_clip
=
0.1
cell_clip
=
0.1
r
,
c
=
lstmp
(
x
,
self
.
lod
,
h0
,
c0
,
w
,
w_rh
,
w_b
,
w_c
,
self
.
is_reverse
,
ACTIVATION
[
self
.
act_gate
],
ACTIVATION
[
self
.
act_cell
],
ACTIVATION
[
self
.
act_cand
],
ACTIVATION
[
self
.
act_proj
])
proj_clip
,
cell_clip
,
ACTIVATION
[
self
.
act_gate
],
ACTIVATION
[
self
.
act_cell
],
ACTIVATION
[
self
.
act_cand
],
ACTIVATION
[
self
.
act_proj
])
self
.
inputs
=
{
'Input'
:
(
x
,
self
.
lod
),
'Weight'
:
w
,
'ProjWeight'
:
w_rh
}
...
...
@@ -159,6 +176,8 @@ class TestLstmpOp(LstmTest.TestLstmOp):
self
.
attrs
=
{
'use_peepholes'
:
self
.
use_peepholes
,
'is_reverse'
:
self
.
is_reverse
,
'proj_clip'
:
proj_clip
,
'cell_clip'
:
cell_clip
,
'gate_activation'
:
self
.
act_gate
,
'cell_activation'
:
self
.
act_cell
,
'candidate_activation'
:
self
.
act_cand
,
...
...
@@ -171,14 +190,14 @@ class TestLstmpOp(LstmTest.TestLstmOp):
def
test_check_grad
(
self
):
# TODO(qingqing) remove folowing lines after the check_grad is refined.
N
=
len
(
self
.
lod
[
0
])
self
.
outputs
[
'OrderedP0'
]
=
np
.
zeros
((
N
,
self
.
P
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchGate'
]
=
np
.
zeros
((
N
,
4
*
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchHidden'
]
=
np
.
zeros
((
N
,
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchCellPreAct'
]
=
np
.
zeros
(
(
N
,
self
.
D
)).
astype
(
'float64'
)
self
.
check_grad
(
[
'Input'
,
'Weight'
,
'ProjWeight'
,
'Bias'
],
[
'Projection'
],
max_relative_error
=
1e-2
)
max_relative_error
=
1e-2
,
numeric_grad_delta
=
0.0000005
)
class
TestLstmpOpHasInitial
(
TestLstmpOp
):
...
...
@@ -188,7 +207,6 @@ class TestLstmpOpHasInitial(TestLstmpOp):
def
test_check_grad
(
self
):
# TODO(qingqing) remove folowing lines after the check_grad is refined.
N
=
len
(
self
.
lod
[
0
])
self
.
outputs
[
'OrderedP0'
]
=
np
.
zeros
((
N
,
self
.
P
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchGate'
]
=
np
.
zeros
((
N
,
4
*
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchHidden'
]
=
np
.
zeros
((
N
,
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchCellPreAct'
]
=
np
.
zeros
(
...
...
@@ -196,11 +214,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self
.
check_grad
(
[
'Input'
,
'Weight'
,
'ProjWeight'
,
'Bias'
,
'H0'
,
'C0'
],
[
'Projection'
],
numeric_grad_delta
=
0.0000005
,
max_relative_error
=
1e-2
)
def
test_check_grad_ingore_bias
(
self
):
N
=
len
(
self
.
lod
[
0
])
self
.
outputs
[
'OrderedP0'
]
=
np
.
zeros
((
N
,
self
.
P
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchGate'
]
=
np
.
zeros
((
N
,
4
*
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchHidden'
]
=
np
.
zeros
((
N
,
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchCellPreAct'
]
=
np
.
zeros
(
...
...
@@ -208,11 +226,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self
.
check_grad
(
[
'Input'
,
'ProjWeight'
,
'Weight'
],
[
'Projection'
],
max_relative_error
=
1e-2
,
numeric_grad_delta
=
0.0000005
,
no_grad_set
=
set
(
'Bias'
))
def
test_check_grad_ingore_weight
(
self
):
N
=
len
(
self
.
lod
[
0
])
self
.
outputs
[
'OrderedP0'
]
=
np
.
zeros
((
N
,
self
.
P
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchGate'
]
=
np
.
zeros
((
N
,
4
*
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchHidden'
]
=
np
.
zeros
((
N
,
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchCellPreAct'
]
=
np
.
zeros
(
...
...
@@ -220,11 +238,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self
.
check_grad
(
[
'Input'
,
'ProjWeight'
,
'Bias'
],
[
'Projection'
],
max_relative_error
=
1e-2
,
numeric_grad_delta
=
0.0000005
,
no_grad_set
=
set
(
'Weight'
))
def
test_check_grad_ingore_proj_weight
(
self
):
N
=
len
(
self
.
lod
[
0
])
self
.
outputs
[
'OrderedP0'
]
=
np
.
zeros
((
N
,
self
.
P
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchGate'
]
=
np
.
zeros
((
N
,
4
*
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchHidden'
]
=
np
.
zeros
((
N
,
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchCellPreAct'
]
=
np
.
zeros
(
...
...
@@ -232,11 +250,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self
.
check_grad
(
[
'Input'
,
'Weight'
,
'Bias'
],
[
'Projection'
],
max_relative_error
=
1e-2
,
numeric_grad_delta
=
0.0000005
,
no_grad_set
=
set
(
'ProjWeight'
))
def
test_check_grad_ingore_input
(
self
):
N
=
len
(
self
.
lod
[
0
])
self
.
outputs
[
'OrderedP0'
]
=
np
.
zeros
((
N
,
self
.
P
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchGate'
]
=
np
.
zeros
((
N
,
4
*
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchHidden'
]
=
np
.
zeros
((
N
,
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchCellPreAct'
]
=
np
.
zeros
(
...
...
@@ -244,11 +262,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self
.
check_grad
(
[
'Weight'
,
'ProjWeight'
,
'Bias'
],
[
'Projection'
],
max_relative_error
=
1e-2
,
numeric_grad_delta
=
0.0000005
,
no_grad_set
=
set
(
'Input'
))
def
test_check_grad_ingore_h0
(
self
):
N
=
len
(
self
.
lod
[
0
])
self
.
outputs
[
'OrderedP0'
]
=
np
.
zeros
((
N
,
self
.
P
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchGate'
]
=
np
.
zeros
((
N
,
4
*
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchHidden'
]
=
np
.
zeros
((
N
,
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchCellPreAct'
]
=
np
.
zeros
(
...
...
@@ -256,11 +274,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self
.
check_grad
(
[
'Input'
,
'Weight'
,
'ProjWeight'
,
'Bias'
,
'C0'
],
[
'Projection'
],
max_relative_error
=
1e-2
,
numeric_grad_delta
=
0.0000005
,
no_grad_set
=
set
(
'H0'
))
def
test_check_grad_ingore_c0
(
self
):
N
=
len
(
self
.
lod
[
0
])
self
.
outputs
[
'OrderedP0'
]
=
np
.
zeros
((
N
,
self
.
P
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchGate'
]
=
np
.
zeros
((
N
,
4
*
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchHidden'
]
=
np
.
zeros
((
N
,
self
.
D
)).
astype
(
'float64'
)
self
.
outputs
[
'BatchCellPreAct'
]
=
np
.
zeros
(
...
...
@@ -268,6 +286,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self
.
check_grad
(
[
'Input'
,
'Weight'
,
'ProjWeight'
,
'Bias'
,
'H0'
],
[
'Projection'
],
max_relative_error
=
1e-2
,
numeric_grad_delta
=
0.0000005
,
no_grad_set
=
set
(
'C0'
))
...
...
python/paddle/fluid/tests/unittests/test_softmax_op.py
浏览文件 @
c4187dbd
...
...
@@ -144,15 +144,5 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
return
[
2
,
3
,
4
,
5
]
class
TestSoftmaxMKLDNNOp
(
TestSoftmaxOp
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
class
TestSoftmaxMKLDNNOp2
(
TestSoftmaxMKLDNNOp
):
def
get_x_shape
(
self
):
return
[
2
,
3
,
4
,
5
]
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/utils/plot.py
浏览文件 @
c4187dbd
...
...
@@ -13,6 +13,7 @@
# limitations under the License.
import
os
import
six
class
PlotData
(
object
):
...
...
@@ -72,8 +73,8 @@ class Ploter(object):
plot_curve = Ploter("Curve 1","Curve 2")
plot_curve.append(title="Curve 1",step=1,value=1)
"""
assert
isinstance
(
title
,
basestring
)
assert
self
.
__plot_data__
.
has_key
(
title
)
assert
isinstance
(
title
,
six
.
string_types
)
assert
title
in
self
.
__plot_data__
data
=
self
.
__plot_data__
[
title
]
assert
isinstance
(
data
,
PlotData
)
data
.
append
(
step
,
value
)
...
...
python/paddle/utils/preprocess_img.py
浏览文件 @
c4187dbd
...
...
@@ -122,7 +122,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater):
def
create_dataset_from_list
(
self
,
path
):
data
=
[]
label_set
=
[]
for
line
in
open
(
file_list
):
for
line
in
open
(
path
):
items
=
line
.
rstrip
.
split
()
image_path
=
items
[
0
]
label_name
=
items
[
1
]
...
...
@@ -141,7 +141,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater):
path: the path of the image dataset.
"""
if
self
.
from_list
:
return
create_dataset_from_list
(
path
)
return
self
.
create_dataset_from_list
(
path
)
label_set
=
preprocess_util
.
get_label_set_from_dir
(
path
)
data
=
[]
for
l_name
in
list
(
label_set
.
keys
()):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录