Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
7e4bd695
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7e4bd695
编写于
11月 27, 2018
作者:
J
JiabinYang
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into enhance_hierachical_sigmod_op
上级
b10df8bc
251a1bb0
变更
154
显示空白变更内容
内联
并排
Showing
154 changed file
with
4196 addition
and
1628 deletion
+4196
-1628
.github/ISSUE_TEMPLATE/---feature-request-.md
.github/ISSUE_TEMPLATE/---feature-request-.md
+27
-0
.github/ISSUE_TEMPLATE/---inference-issue-.md
.github/ISSUE_TEMPLATE/---inference-issue-.md
+40
-0
.github/ISSUE_TEMPLATE/---installation-issue-.md
.github/ISSUE_TEMPLATE/---installation-issue-.md
+40
-0
.github/ISSUE_TEMPLATE/---model-issue-.md
.github/ISSUE_TEMPLATE/---model-issue-.md
+36
-0
.github/ISSUE_TEMPLATE/---others-.md
.github/ISSUE_TEMPLATE/---others-.md
+33
-0
.github/ISSUE_TEMPLATE/---training-issue-.md
.github/ISSUE_TEMPLATE/---training-issue-.md
+38
-0
CMakeLists.txt
CMakeLists.txt
+2
-0
Dockerfile
Dockerfile
+41
-0
cmake/cuda.cmake
cmake/cuda.cmake
+4
-1
cmake/external/dlpack.cmake
cmake/external/dlpack.cmake
+31
-0
cmake/external/eigen.cmake
cmake/external/eigen.cmake
+1
-1
cmake/external/mkldnn.cmake
cmake/external/mkldnn.cmake
+1
-1
cmake/external/pybind11.cmake
cmake/external/pybind11.cmake
+1
-1
cmake/external/rocprim.cmake
cmake/external/rocprim.cmake
+44
-0
cmake/flags.cmake
cmake/flags.cmake
+3
-0
cmake/generic.cmake
cmake/generic.cmake
+24
-13
cmake/hip.cmake
cmake/hip.cmake
+27
-5
cmake/operators.cmake
cmake/operators.cmake
+2
-1
paddle/fluid/API.spec
paddle/fluid/API.spec
+6
-6
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+5
-1
paddle/fluid/framework/details/all_reduce_op_handle.cc
paddle/fluid/framework/details/all_reduce_op_handle.cc
+2
-2
paddle/fluid/framework/details/all_reduce_op_handle.h
paddle/fluid/framework/details/all_reduce_op_handle.h
+3
-3
paddle/fluid/framework/details/broadcast_op_handle.cc
paddle/fluid/framework/details/broadcast_op_handle.cc
+1
-1
paddle/fluid/framework/details/broadcast_op_handle.h
paddle/fluid/framework/details/broadcast_op_handle.h
+3
-3
paddle/fluid/framework/details/broadcast_op_handle_test.h
paddle/fluid/framework/details/broadcast_op_handle_test.h
+6
-6
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+2
-2
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+2
-2
paddle/fluid/framework/details/data_balance_op_handle.cc
paddle/fluid/framework/details/data_balance_op_handle.cc
+1
-1
paddle/fluid/framework/details/data_balance_op_handle.h
paddle/fluid/framework/details/data_balance_op_handle.h
+2
-2
paddle/fluid/framework/details/fused_broadcast_op_handle.h
paddle/fluid/framework/details/fused_broadcast_op_handle.h
+2
-2
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
...fluid/framework/details/fused_broadcast_op_handle_test.cc
+2
-2
paddle/fluid/framework/details/multi_devices_graph_pass.cc
paddle/fluid/framework/details/multi_devices_graph_pass.cc
+8
-8
paddle/fluid/framework/details/multi_devices_graph_pass.h
paddle/fluid/framework/details/multi_devices_graph_pass.h
+1
-1
paddle/fluid/framework/details/reduce_op_handle.cc
paddle/fluid/framework/details/reduce_op_handle.cc
+1
-1
paddle/fluid/framework/details/reduce_op_handle.h
paddle/fluid/framework/details/reduce_op_handle.h
+2
-2
paddle/fluid/framework/details/reduce_op_handle_test.cc
paddle/fluid/framework/details/reduce_op_handle_test.cc
+6
-6
paddle/fluid/framework/dlpack_tensor.cc
paddle/fluid/framework/dlpack_tensor.cc
+127
-0
paddle/fluid/framework/dlpack_tensor.h
paddle/fluid/framework/dlpack_tensor.h
+45
-0
paddle/fluid/framework/dlpack_tensor_test.cc
paddle/fluid/framework/dlpack_tensor_test.cc
+113
-0
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+3
-2
paddle/fluid/framework/ir/is_test_pass_tester.cc
paddle/fluid/framework/ir/is_test_pass_tester.cc
+4
-1
paddle/fluid/framework/lod_tensor.cc
paddle/fluid/framework/lod_tensor.cc
+1
-16
paddle/fluid/framework/lod_tensor_test.cc
paddle/fluid/framework/lod_tensor_test.cc
+0
-2
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+1
-0
paddle/fluid/framework/op_desc.cc
paddle/fluid/framework/op_desc.cc
+6
-0
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+21
-39
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+4
-0
paddle/fluid/framework/transfer_scope_cache.cc
paddle/fluid/framework/transfer_scope_cache.cc
+62
-0
paddle/fluid/framework/transfer_scope_cache.h
paddle/fluid/framework/transfer_scope_cache.h
+41
-0
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+1
-0
paddle/fluid/inference/analysis/analyzer_tester.cc
paddle/fluid/inference/analysis/analyzer_tester.cc
+2
-1
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+3
-1
paddle/fluid/inference/api/analysis_config.cc
paddle/fluid/inference/api/analysis_config.cc
+2
-0
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+27
-12
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+2
-0
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+20
-12
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+0
-2
paddle/fluid/inference/api/helper.h
paddle/fluid/inference/api/helper.h
+1
-4
paddle/fluid/inference/api/paddle_analysis_config.h
paddle/fluid/inference/api/paddle_analysis_config.h
+1
-1
paddle/fluid/inference/api/paddle_api.h
paddle/fluid/inference/api/paddle_api.h
+13
-0
paddle/fluid/inference/tensorrt/convert/split_op.cc
paddle/fluid/inference/tensorrt/convert/split_op.cc
+2
-10
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+75
-13
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+127
-31
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+7
-2
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+25
-25
paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
+0
-1
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+1
-0
paddle/fluid/inference/tests/api/config_printer.h
paddle/fluid/inference/tests/api/config_printer.h
+2
-0
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+11
-13
paddle/fluid/inference/tests/book/test_inference_nlp.cc
paddle/fluid/inference/tests/book/test_inference_nlp.cc
+0
-1
paddle/fluid/inference/tests/test_helper.h
paddle/fluid/inference/tests/test_helper.h
+1
-0
paddle/fluid/inference/utils/CMakeLists.txt
paddle/fluid/inference/utils/CMakeLists.txt
+2
-0
paddle/fluid/inference/utils/benchmark.cc
paddle/fluid/inference/utils/benchmark.cc
+49
-0
paddle/fluid/inference/utils/benchmark.h
paddle/fluid/inference/utils/benchmark.h
+52
-0
paddle/fluid/inference/utils/benchmark_tester.cc
paddle/fluid/inference/utils/benchmark_tester.cc
+39
-0
paddle/fluid/memory/allocation/retry_allocator_test.cc
paddle/fluid/memory/allocation/retry_allocator_test.cc
+1
-1
paddle/fluid/operators/beam_search_op_test.cc
paddle/fluid/operators/beam_search_op_test.cc
+2
-2
paddle/fluid/operators/detection/CMakeLists.txt
paddle/fluid/operators/detection/CMakeLists.txt
+1
-1
paddle/fluid/operators/detection/density_prior_box_op.cc
paddle/fluid/operators/detection/density_prior_box_op.cc
+21
-15
paddle/fluid/operators/detection/density_prior_box_op.cu
paddle/fluid/operators/detection/density_prior_box_op.cu
+170
-0
paddle/fluid/operators/detection/density_prior_box_op.h
paddle/fluid/operators/detection/density_prior_box_op.h
+35
-38
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+6
-1
paddle/fluid/operators/distributed/grpc_serde.cc
paddle/fluid/operators/distributed/grpc_serde.cc
+1
-1
paddle/fluid/operators/distributed/grpc_serde.h
paddle/fluid/operators/distributed/grpc_serde.h
+2
-1
paddle/fluid/operators/distributed/grpc_server.cc
paddle/fluid/operators/distributed/grpc_server.cc
+20
-0
paddle/fluid/operators/distributed/sendrecvop_utils.cc
paddle/fluid/operators/distributed/sendrecvop_utils.cc
+3
-1
paddle/fluid/operators/distributed/sendrecvop_utils.h
paddle/fluid/operators/distributed/sendrecvop_utils.h
+1
-1
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/CMakeLists.txt
+5
-1
paddle/fluid/operators/fused/fusion_gru_op.cc
paddle/fluid/operators/fused/fusion_gru_op.cc
+41
-26
paddle/fluid/operators/fused/fusion_lstm_op.cc
paddle/fluid/operators/fused/fusion_lstm_op.cc
+46
-27
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
...uid/operators/fused/fusion_transpose_flatten_concat_op.cc
+114
-0
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
.../operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+115
-0
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
...luid/operators/fused/fusion_transpose_flatten_concat_op.h
+50
-0
paddle/fluid/operators/interpolate_op.cc
paddle/fluid/operators/interpolate_op.cc
+18
-9
paddle/fluid/operators/interpolate_op.cu
paddle/fluid/operators/interpolate_op.cu
+8
-2
paddle/fluid/operators/lookup_sparse_table_op.cc
paddle/fluid/operators/lookup_sparse_table_op.cc
+1
-0
paddle/fluid/operators/math/blas_impl.cu.h
paddle/fluid/operators/math/blas_impl.cu.h
+172
-34
paddle/fluid/operators/math/cpu_vec_test.cc
paddle/fluid/operators/math/cpu_vec_test.cc
+1
-1
paddle/fluid/operators/math/fc_compute.h
paddle/fluid/operators/math/fc_compute.h
+1
-3
paddle/fluid/operators/math/im2col_test.cc
paddle/fluid/operators/math/im2col_test.cc
+1
-1
paddle/fluid/operators/math/jit_code.cc
paddle/fluid/operators/math/jit_code.cc
+155
-39
paddle/fluid/operators/math/jit_code.h
paddle/fluid/operators/math/jit_code.h
+200
-32
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+7
-19
paddle/fluid/operators/math/jit_kernel_blas.cc
paddle/fluid/operators/math/jit_kernel_blas.cc
+10
-55
paddle/fluid/operators/math/jit_kernel_exp.cc
paddle/fluid/operators/math/jit_kernel_exp.cc
+4
-188
paddle/fluid/operators/math/jit_kernel_impl.h
paddle/fluid/operators/math/jit_kernel_impl.h
+73
-0
paddle/fluid/operators/math/jit_kernel_macro.h
paddle/fluid/operators/math/jit_kernel_macro.h
+4
-4
paddle/fluid/operators/math/jit_kernel_refer.h
paddle/fluid/operators/math/jit_kernel_refer.h
+238
-0
paddle/fluid/operators/math/jit_kernel_rnn.cc
paddle/fluid/operators/math/jit_kernel_rnn.cc
+184
-406
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+62
-136
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+3
-0
paddle/fluid/operators/tensor_array_to_tensor_op.cc
paddle/fluid/operators/tensor_array_to_tensor_op.cc
+5
-5
paddle/fluid/platform/cpu_helper.cc
paddle/fluid/platform/cpu_helper.cc
+1
-1
paddle/fluid/platform/cudnn_helper.h
paddle/fluid/platform/cudnn_helper.h
+1
-1
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+47
-0
paddle/fluid/platform/dynload/cublas.cc
paddle/fluid/platform/dynload/cublas.cc
+3
-0
paddle/fluid/platform/dynload/cublas.h
paddle/fluid/platform/dynload/cublas.h
+12
-4
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+7
-7
paddle/fluid/platform/gpu_info.cc
paddle/fluid/platform/gpu_info.cc
+30
-1
paddle/fluid/platform/gpu_info.h
paddle/fluid/platform/gpu_info.h
+3
-0
paddle/fluid/platform/stream_callback_manager.h
paddle/fluid/platform/stream_callback_manager.h
+1
-1
paddle/fluid/pybind/CMakeLists.txt
paddle/fluid/pybind/CMakeLists.txt
+2
-2
paddle/fluid/pybind/protobuf.cc
paddle/fluid/pybind/protobuf.cc
+7
-6
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+2
-3
paddle/fluid/pybind/tensor_py.h
paddle/fluid/pybind/tensor_py.h
+0
-1
paddle/legacy/cuda/include/hl_warpctc_wrap.h
paddle/legacy/cuda/include/hl_warpctc_wrap.h
+2
-1
paddle/legacy/cuda/src/hl_cuda_device.cc
paddle/legacy/cuda/src/hl_cuda_device.cc
+4
-0
paddle/legacy/utils/ThreadLocal.h
paddle/legacy/utils/ThreadLocal.h
+3
-1
paddle/legacy/utils/Util.h
paddle/legacy/utils/Util.h
+27
-0
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+86
-1
paddle/testing/CMakeLists.txt
paddle/testing/CMakeLists.txt
+4
-2
paddle/testing/paddle_gtest_main.cc
paddle/testing/paddle_gtest_main.cc
+6
-1
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+12
-6
python/paddle/fluid/contrib/utils/__init__.py
python/paddle/fluid/contrib/utils/__init__.py
+3
-1
python/paddle/fluid/contrib/utils/lookup_table_utils.py
python/paddle/fluid/contrib/utils/lookup_table_utils.py
+256
-0
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+20
-0
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+40
-11
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+5
-3
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+20
-23
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+27
-23
python/paddle/fluid/metrics.py
python/paddle/fluid/metrics.py
+2
-2
python/paddle/fluid/nets.py
python/paddle/fluid/nets.py
+8
-1
python/paddle/fluid/tests/book/test_image_classification.py
python/paddle/fluid/tests/book/test_image_classification.py
+1
-1
python/paddle/fluid/tests/test_detection.py
python/paddle/fluid/tests/test_detection.py
+32
-28
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+10
-8
python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
...n/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+17
-148
python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
...paddle/fluid/tests/unittests/test_density_prior_box_op.py
+19
-11
python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
...ests/unittests/test_fusion_transpose_flatten_concat_op.py
+105
-0
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+11
-0
python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
...on/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+7
-2
python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
...on/paddle/fluid/tests/unittests/test_nearest_interp_op.py
+197
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+19
-10
tools/manylinux1/Dockerfile.x64
tools/manylinux1/Dockerfile.x64
+1
-1
tools/manylinux1/build_scripts/build_utils.sh
tools/manylinux1/build_scripts/build_utils.sh
+12
-3
未找到文件。
.github/ISSUE_TEMPLATE/---feature-request-.md
0 → 100644
浏览文件 @
7e4bd695
---
name
:
建议(Feature request)
about
:
您可以提出您的建议。 You could use this template for reporting a suggestion issue.
---
欢迎您对PaddlePaddle提出建议,非常感谢您对PaddlePaddle的贡献!
在留下您的建议时,辛苦您同步提供如下信息:
-
版本、环境信息
1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1
2)CPU/GPU:您是否使用GPU进行训练,如是,请提供您的CUDA和cuDNN版本号
3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14
-
复现信息:如为报错,请给出复现环境、复现步骤
-
建议描述:请您详细描述,您认为需优化的功能
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
Please make sure that this is a feature request.
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/CUDNN version
-OS Platform (eg.Mac OS 10.14)
**To Reproduce**
Steps to reproduce the behavior
**Describe the feature and the current behavior/state.**
**Any Other info.**
.github/ISSUE_TEMPLATE/---inference-issue-.md
0 → 100644
浏览文件 @
7e4bd695
---
name
:
预测(Inference Issue)
about
:
您可以提问预测中报错、应用等问题。 You could use this template for reporting an inference issue.
---
为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
-
标题:简洁、精准描述您的问题,例如“最新预测库的API文档在哪儿 ”
-
版本、环境信息:
1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID
2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况
3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号
4)系统环境:请您描述系统类型、版本(如Mac OS 10.14),Python版本
-预测信息
1)C++预测:请您提供预测库安装包的版本信息,及其中的version.txt文件
2)CMake包含路径的完整命令
3)API信息(如调用请提供)
4)预测库来源:官网下载/特殊环境(如BCLOUD编译)
-
复现信息:如为报错,请给出复现环境、复现步骤
-
问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
If there is no solution,please make sure that this is an inference issue including the following details :
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/CUDNN version
-OS Platform (eg.Mac OS 10.14)
-Python version
-CMake commands
-C++version.txt
-API information
**To Reproduce**
Steps to reproduce the behavior
**Describe your current behavior**
**Code to reproduce the issue**
**Other info / logs**
.github/ISSUE_TEMPLATE/---installation-issue-.md
0 → 100644
浏览文件 @
7e4bd695
---
name
:
安装(Installation Issue)
about
:
您可以提问安装、编译出现报错等问题。 You could use this template for reporting an installation
issue.
---
为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
建立issue时,为快速解决问题,请您根据使用情况给出如下信息:
-
标题:请包含关键词“安装错误”/“编译错误”,例如“Mac编译错误”
-
版本、环境信息:
1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID
2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况
3)GPU:请提供GPU型号,CUDA和CUDNN版本号
4)系统环境:请说明系统类型、版本(如Mac OS 10.14)、Python版本
-
安装方式信息:
1)pip安装/docker安装
2)本地编译:请提供cmake命令,编译命令
3)docker编译:请提供docker镜像,编译命令
特殊环境请注明:如离线安装等
-
复现信息:如为报错,请给出复现环境、复现步骤
-
问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in Github in case that there was a similar issue submitted or resolved before.
If there is no solution,please make sure that this is an installation issue including the following details:
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/CUDNN version
-OS Platform (eg. Mac OS 10.14)
-Python version
-
Install method: pip install/install with docker/build from source(without docker)/build within docker
-
Other special cases that you think may be related to this problem, eg. offline install, special internet condition
**To Reproduce**
Steps to reproduce the behavior
**Describe your current behavior**
**Code to reproduce the issue**
**Other info / logs**
.github/ISSUE_TEMPLATE/---model-issue-.md
0 → 100644
浏览文件 @
7e4bd695
---
name
:
模型(Model Issue)
about
:
您可以提问模型、算法、数据集方向的使用报错等问题。You could use this template for reporting a model/
algorithm/dataset issue.
---
为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
建立issue时,为快速解决问题,请您根据使用情况给出如下信息:
-
标题:简洁、精准描述您的问题,例如“ssd 模型前置lstm报错 ”
-
版本、环境信息:
1)PaddlePaddle版本:请提供PaddlePaddle版本号,例如1.1或CommitID
2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况
3)GPU:请提供GPU型号,CUDA和CUDNN版本号
4)系统环境:请说明系统类型、版本(例如Mac OS 10.14),Python版本
-
模型信息
1)模型名称 2)使用数据集名称 3)使用算法名称 4)模型链接
-
复现信息:如为报错,请给出复现环境、复现步骤
-
问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in the github. Probably there was a similar issue submitted or resolved before.
If there is no solution, please make sure that this is an issue of models including the following details:
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/CUDNN version
-OS Platform (eg.Mac OS 10.14)
-Python version
-Name of Models&Dataset/details of operator
**To Reproduce**
Steps to reproduce the behavior
**Describe your current behavior**
**Code to reproduce the issue**
**Other info / logs**
.github/ISSUE_TEMPLATE/---others-.md
0 → 100644
浏览文件 @
7e4bd695
---
name
:
其他(Others)
about
:
如上述分类未包含您的问题,可在此提出。 You could use this template for reporting other issues
---
为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
-
标题:简洁、精准概括您的问题
-
版本、环境信息:
1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID
2)CPU/GPU:如果您使用GPU训练,请提供GPU驱动版本、CUDA和cuDNN版本号
3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14
4)Python版本号
5)显存信息
-
复现信息:如为报错,请给出复现环境、复现步骤
-
问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
If there is no solution,please provide us with the following details :
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/cuDNN version
-OS Platform and Distribution(eg.Mac OS 10.14)
-Python version
**To Reproduce**
Steps to reproduce the behavior
**Describe your current behavior**
**Code to reproduce the issue**
**Other info / logs**
.github/ISSUE_TEMPLATE/---training-issue-.md
0 → 100644
浏览文件 @
7e4bd695
---
name
:
训练(Training issue)
about
:
您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting a training
issue.
---
为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
-
标题:简洁、精准概括您的问题,例如“Insufficient Memory xxx”
-
版本、环境信息:
1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID
2)CPU:训练若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况
3)GPU:训练若用GPU,请提供GPU型号、CUDA和CUDNN版本号
4)系统环境:请您描述系统类型、版本,例如Mac OS 10.14,Python版本
-
训练信息
1)单机/多机,单卡/多卡
2)显存信息
3)Operator信息
-
复现信息:如为报错,请给出复现环境、复现步骤
-
问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
If there is no solution,please make sure that this is a training issue including the following details:
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/CUDNN version
-OS Platform (eg.Mac OS 10.14)
-Other information: Distributed training/information of operator/
Graphics card storage
**To Reproduce**
Steps to reproduce the behavior
**Describe your current behavior**
**Code to reproduce the issue**
**Other info / logs**
CMakeLists.txt
浏览文件 @
7e4bd695
...
@@ -204,7 +204,9 @@ include(external/eigen) # download eigen3
...
@@ -204,7 +204,9 @@ include(external/eigen) # download eigen3
include
(
external/pybind11
)
# download pybind11
include
(
external/pybind11
)
# download pybind11
include
(
external/cares
)
include
(
external/cares
)
include
(
external/cub
)
include
(
external/cub
)
include
(
external/rocprim
)
include
(
external/xxhash
)
# download xxhash
include
(
external/xxhash
)
# download xxhash
include
(
external/dlpack
)
include
(
external/snappy
)
# download snappy
include
(
external/snappy
)
# download snappy
include
(
external/snappystream
)
# download snappystream
include
(
external/snappystream
)
# download snappystream
...
...
Dockerfile
浏览文件 @
7e4bd695
...
@@ -22,6 +22,29 @@ ENV HOME /root
...
@@ -22,6 +22,29 @@ ENV HOME /root
# Add bash enhancements
# Add bash enhancements
COPY
./paddle/scripts/docker/root/ /root/
COPY
./paddle/scripts/docker/root/ /root/
# Prepare packages for Python
RUN
apt-get update
&&
\
apt-get
install
-y
make build-essential libssl-dev zlib1g-dev libbz2-dev
\
libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev
\
xz-utils tk-dev libffi-dev liblzma-dev
# Install Python3.6
RUN
mkdir
-p
/root/python_build/
&&
wget
-q
https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz
&&
\
tar
-zxf
sqlite-autoconf-3250300.tar.gz
&&
cd
sqlite-autoconf-3250300
&&
\
./configure
-prefix
=
/usr/local
&&
make
-j8
&&
make
install
&&
cd
../
&&
rm
sqlite-autoconf-3250300.tar.gz
&&
\
wget
-q
https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz
&&
\
tar
-xzf
Python-3.6.0.tgz
&&
cd
Python-3.6.0
&&
\
CFLAGS
=
"-Wformat"
./configure
--prefix
=
/usr/local/
--enable-shared
>
/dev/null
&&
\
make
-j8
>
/dev/null
&&
make altinstall
>
/dev/null
# Install Python3.7
RUN
wget
-q
https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz
&&
\
tar
-xzf
Python-3.7.0.tgz
&&
cd
Python-3.7.0
&&
\
CFLAGS
=
"-Wformat"
./configure
--prefix
=
/usr/local/
--enable-shared
>
/dev/null
&&
\
make
-j8
>
/dev/null
&&
make altinstall
>
/dev/null
RUN
rm
-r
/root/python_build
RUN
apt-get update
&&
\
RUN
apt-get update
&&
\
apt-get
install
-y
--allow-downgrades
patchelf
\
apt-get
install
-y
--allow-downgrades
patchelf
\
python3 python3-dev python3-pip
\
python3 python3-dev python3-pip
\
...
@@ -74,6 +97,12 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
...
@@ -74,6 +97,12 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
RUN
pip3
install
-U
wheel
&&
\
RUN
pip3
install
-U
wheel
&&
\
pip3
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
pip3
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
pip3.6
install
-U
wheel
&&
\
pip3.6
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3.6
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
pip3.7
install
-U
wheel
&&
\
pip3.7
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3.7
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
easy_install
-U
pip
&&
\
easy_install
-U
pip
&&
\
pip
install
-U
pip setuptools wheel
&&
\
pip
install
-U
pip setuptools wheel
&&
\
pip
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
...
@@ -82,22 +111,34 @@ RUN pip3 install -U wheel && \
...
@@ -82,22 +111,34 @@ RUN pip3 install -U wheel && \
RUN
pip3
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
RUN
pip3
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip3
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3
install
opencv-python
&&
\
pip3
install
opencv-python
&&
\
pip3.6
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip3.6
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3.6
install
opencv-python
&&
\
pip3.7
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip3.7
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3.7
install
opencv-python
&&
\
pip
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip
install
opencv-python
pip
install
opencv-python
#For docstring checker
#For docstring checker
RUN
pip3
install
pylint pytest astroid isort
RUN
pip3
install
pylint pytest astroid isort
RUN
pip3.6
install
pylint pytest astroid isort
RUN
pip3.7
install
pylint pytest astroid isort
RUN
pip
install
pylint pytest astroid isort LinkChecker
RUN
pip
install
pylint pytest astroid isort LinkChecker
COPY
./python/requirements.txt /root/
COPY
./python/requirements.txt /root/
RUN
pip3
install
-r
/root/requirements.txt
RUN
pip3
install
-r
/root/requirements.txt
RUN
pip3.6
install
-r
/root/requirements.txt
RUN
pip3.7
install
-r
/root/requirements.txt
RUN
pip
install
-r
/root/requirements.txt
RUN
pip
install
-r
/root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN
apt-get
install
-y
libssl-dev libffi-dev
RUN
apt-get
install
-y
libssl-dev libffi-dev
RUN
pip3
install
certifi urllib3[secure]
RUN
pip3
install
certifi urllib3[secure]
RUN
pip3.6
install
certifi urllib3[secure]
RUN
pip3.7
install
certifi urllib3[secure]
RUN
pip
install
certifi urllib3[secure]
RUN
pip
install
certifi urllib3[secure]
...
...
cmake/cuda.cmake
浏览文件 @
7e4bd695
...
@@ -199,8 +199,11 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
...
@@ -199,8 +199,11 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list
(
APPEND CUDA_NVCC_FLAGS
${
CMAKE_CXX_FLAGS_RELEASE
}
)
list
(
APPEND CUDA_NVCC_FLAGS
${
CMAKE_CXX_FLAGS_RELEASE
}
)
endif
()
endif
()
else
(
NOT WIN32
)
else
(
NOT WIN32
)
list
(
APPEND CUDA_NVCC_FLAGS
"--compiler-options;/bigobj"
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
list
(
APPEND CUDA_NVCC_FLAGS
"-g -G"
)
list
(
APPEND CUDA_NVCC_FLAGS
"-g -G"
)
# match the cl's _ITERATOR_DEBUG_LEVEL
list
(
APPEND CUDA_NVCC_FLAGS
"-D_DEBUG"
)
elseif
(
CMAKE_BUILD_TYPE STREQUAL
"Release"
)
elseif
(
CMAKE_BUILD_TYPE STREQUAL
"Release"
)
list
(
APPEND CUDA_NVCC_FLAGS
"-O3 -DNDEBUG"
)
list
(
APPEND CUDA_NVCC_FLAGS
"-O3 -DNDEBUG"
)
else
()
else
()
...
...
cmake/external/dlpack.cmake
0 → 100644
浏览文件 @
7e4bd695
include
(
ExternalProject
)
set
(
DLPACK_SOURCE_DIR
${
THIRD_PARTY_PATH
}
/dlpack
)
set
(
DLPACK_INCLUDE_DIR
${
DLPACK_SOURCE_DIR
}
/src/extern_dlpack/include
)
include_directories
(
${
DLPACK_INCLUDE_DIR
}
)
ExternalProject_Add
(
extern_dlpack
${
EXTERNAL_PROJECT_LOG_ARGS
}
GIT_REPOSITORY
"https://github.com/dmlc/dlpack.git"
GIT_TAG
"v0.2"
PREFIX
${
DLPACK_SOURCE_DIR
}
UPDATE_COMMAND
""
CONFIGURE_COMMAND
""
BUILD_COMMAND
""
INSTALL_COMMAND
""
TEST_COMMAND
""
)
if
(
${
CMAKE_VERSION
}
VERSION_LESS
"3.3.0"
)
set
(
dummyfile
${
CMAKE_CURRENT_BINARY_DIR
}
/dlpack_dummy.c
)
file
(
WRITE
${
dummyfile
}
"const char *dummy =
\"
${
dummyfile
}
\"
;"
)
add_library
(
dlpack STATIC
${
dummyfile
}
)
else
()
add_library
(
dlpack INTERFACE
)
endif
()
add_dependencies
(
dlpack extern_dlpack
)
LIST
(
APPEND externl_project_dependencies dlpack
)
cmake/external/eigen.cmake
浏览文件 @
7e4bd695
...
@@ -17,7 +17,7 @@ if(WITH_AMD_GPU)
...
@@ -17,7 +17,7 @@ if(WITH_AMD_GPU)
extern_eigen3
extern_eigen3
${
EXTERNAL_PROJECT_LOG_ARGS
}
${
EXTERNAL_PROJECT_LOG_ARGS
}
GIT_REPOSITORY
"https://github.com/sabreshao/hipeigen.git"
GIT_REPOSITORY
"https://github.com/sabreshao/hipeigen.git"
GIT_TAG
0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
GIT_TAG
7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
PREFIX
${
EIGEN_SOURCE_DIR
}
PREFIX
${
EIGEN_SOURCE_DIR
}
UPDATE_COMMAND
""
UPDATE_COMMAND
""
CONFIGURE_COMMAND
""
CONFIGURE_COMMAND
""
...
...
cmake/external/mkldnn.cmake
浏览文件 @
7e4bd695
...
@@ -53,7 +53,7 @@ ExternalProject_Add(
...
@@ -53,7 +53,7 @@ ExternalProject_Add(
${
EXTERNAL_PROJECT_LOG_ARGS
}
${
EXTERNAL_PROJECT_LOG_ARGS
}
DEPENDS
${
MKLDNN_DEPENDS
}
DEPENDS
${
MKLDNN_DEPENDS
}
GIT_REPOSITORY
"https://github.com/01org/mkl-dnn.git"
GIT_REPOSITORY
"https://github.com/01org/mkl-dnn.git"
GIT_TAG
"
21fb5f2af1dd14e132af4f1b79160977ee487818
"
GIT_TAG
"
830a10059a018cd2634d94195140cf2d8790a75a
"
PREFIX
${
MKLDNN_SOURCES_DIR
}
PREFIX
${
MKLDNN_SOURCES_DIR
}
UPDATE_COMMAND
""
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
...
...
cmake/external/pybind11.cmake
浏览文件 @
7e4bd695
...
@@ -26,7 +26,7 @@ ExternalProject_Add(
...
@@ -26,7 +26,7 @@ ExternalProject_Add(
extern_pybind
extern_pybind
${
EXTERNAL_PROJECT_LOG_ARGS
}
${
EXTERNAL_PROJECT_LOG_ARGS
}
GIT_REPOSITORY
"https://github.com/pybind/pybind11.git"
GIT_REPOSITORY
"https://github.com/pybind/pybind11.git"
GIT_TAG
"v2.
1.1
"
GIT_TAG
"v2.
2.4
"
PREFIX
${
PYBIND_SOURCE_DIR
}
PREFIX
${
PYBIND_SOURCE_DIR
}
UPDATE_COMMAND
""
UPDATE_COMMAND
""
CONFIGURE_COMMAND
""
CONFIGURE_COMMAND
""
...
...
cmake/external/rocprim.cmake
0 → 100644
浏览文件 @
7e4bd695
if
(
NOT WITH_AMD_GPU
)
return
()
endif
()
# rocprim is "ROCm Parallel Primitives" for short.
# It is a header-only library providing HIP and HC parallel primitives
# for developing performant GPU-accelerated code on AMD ROCm platform.
if
(
"x
${
HCC_HOME
}
"
STREQUAL
"x"
)
set
(
HCC_HOME
"/opt/rocm/hcc"
)
endif
()
INCLUDE
(
ExternalProject
)
SET
(
ROCPRIM_SOURCE_DIR
${
THIRD_PARTY_PATH
}
/rocprim
)
SET
(
ROCPRIM_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/install/rocprim
)
SET
(
ROCPRIM_INCLUDE_DIR
${
ROCPRIM_INSTALL_DIR
}
/include
)
ExternalProject_Add
(
extern_rocprim
GIT_REPOSITORY
"https://github.com/ROCmSoftwarePlatform/rocPRIM.git"
GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc
PREFIX
${
ROCPRIM_SOURCE_DIR
}
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
HCC_HOME
}
/bin/hcc
CMAKE_ARGS -DONLY_INSTALL=ON
CMAKE_ARGS -DBUILD_TEST=OFF
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=
${
ROCPRIM_INSTALL_DIR
}
INSTALL_DIR
${
ROCPRIM_INSTALL_DIR
}
${
EXTERNAL_PROJECT_LOG_ARGS
}
)
INCLUDE_DIRECTORIES
(
${
ROCPRIM_INCLUDE_DIR
}
)
if
(
${
CMAKE_VERSION
}
VERSION_LESS
"3.3.0"
)
set
(
dummyfile
${
CMAKE_CURRENT_BINARY_DIR
}
/rocprim_dummy.c
)
file
(
WRITE
${
dummyfile
}
"const char *dummy_rocprim =
\"
${
dummyfile
}
\"
;"
)
add_library
(
rocprim STATIC
${
dummyfile
}
)
else
()
add_library
(
rocprim INTERFACE
)
endif
()
add_dependencies
(
rocprim extern_rocprim
)
cmake/flags.cmake
浏览文件 @
7e4bd695
...
@@ -129,6 +129,9 @@ set(COMMON_FLAGS
...
@@ -129,6 +129,9 @@ set(COMMON_FLAGS
-Wno-error=parentheses-equality
# Warnings in pybind11
-Wno-error=parentheses-equality
# Warnings in pybind11
-Wno-error=ignored-attributes
# Warnings in Eigen, gcc 6.3
-Wno-error=ignored-attributes
# Warnings in Eigen, gcc 6.3
-Wno-error=terminate
# Warning in PADDLE_ENFORCE
-Wno-error=terminate
# Warning in PADDLE_ENFORCE
-Wno-error=int-in-bool-context
# Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0
# Warning in tinyformat.h
-Wno-error=maybe-uninitialized
# Warning in boost gcc 7.2
)
)
set
(
GPU_COMMON_FLAGS
set
(
GPU_COMMON_FLAGS
...
...
cmake/generic.cmake
浏览文件 @
7e4bd695
...
@@ -349,10 +349,17 @@ function(cc_test TARGET_NAME)
...
@@ -349,10 +349,17 @@ function(cc_test TARGET_NAME)
set
(
oneValueArgs
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DEPS ARGS
)
set
(
multiValueArgs SRCS DEPS ARGS
)
cmake_parse_arguments
(
cc_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cmake_parse_arguments
(
cc_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
if
(
WIN32
)
list
(
APPEND win32_deps shlwapi
)
if
(
"
${
cc_test_DEPS
}
;"
MATCHES
"python;"
)
list
(
REMOVE_ITEM cc_test_DEPS python
)
list
(
APPEND win32_deps
${
PYTHON_LIBRARIES
}
)
endif
()
endif
(
WIN32
)
add_executable
(
${
TARGET_NAME
}
${
cc_test_SRCS
}
)
add_executable
(
${
TARGET_NAME
}
${
cc_test_SRCS
}
)
target_link_libraries
(
${
TARGET_NAME
}
${
cc_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
target_link_libraries
(
${
TARGET_NAME
}
${
cc_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
if
(
WIN32
)
if
(
WIN32
)
target_link_libraries
(
${
TARGET_NAME
}
shlwapi
)
target_link_libraries
(
${
TARGET_NAME
}
${
win32_deps
}
)
endif
(
WIN32
)
endif
(
WIN32
)
add_dependencies
(
${
TARGET_NAME
}
${
cc_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
add_dependencies
(
${
TARGET_NAME
}
${
cc_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
add_test
(
NAME
${
TARGET_NAME
}
add_test
(
NAME
${
TARGET_NAME
}
...
@@ -454,11 +461,15 @@ function(hip_library TARGET_NAME)
...
@@ -454,11 +461,15 @@ function(hip_library TARGET_NAME)
else
()
else
()
add_library
(
${
TARGET_NAME
}
STATIC
${
_cmake_options
}
${
_generated_files
}
${
_sources
}
)
add_library
(
${
TARGET_NAME
}
STATIC
${
_cmake_options
}
${
_generated_files
}
${
_sources
}
)
set_target_properties
(
${
TARGET_NAME
}
PROPERTIES LINKER_LANGUAGE CXX
)
set_target_properties
(
${
TARGET_NAME
}
PROPERTIES LINKER_LANGUAGE CXX
)
target_link_libraries
(
${
TARGET_NAME
}
/opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a
)
target_link_libraries
(
${
TARGET_NAME
}
/opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a
/opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so
)
find_fluid_modules
(
${
TARGET_NAME
}
)
find_fluid_modules
(
${
TARGET_NAME
}
)
endif
()
endif
()
if
(
hip_library_DEPS
)
if
(
"
${
hip_library_DEPS
}
"
MATCHES
"ARCHIVE_START"
)
add_dependencies
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
target_circle_link_libraries
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
list
(
REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END
)
else
()
target_link_libraries
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
target_link_libraries
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
endif
()
endif
()
# cpplint code style
# cpplint code style
...
@@ -679,7 +690,7 @@ function(py_test TARGET_NAME)
...
@@ -679,7 +690,7 @@ function(py_test TARGET_NAME)
set
(
multiValueArgs SRCS DEPS ARGS ENVS
)
set
(
multiValueArgs SRCS DEPS ARGS ENVS
)
cmake_parse_arguments
(
py_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cmake_parse_arguments
(
py_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
add_test
(
NAME
${
TARGET_NAME
}
add_test
(
NAME
${
TARGET_NAME
}
COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
COMMAND
${
CMAKE_COMMAND
}
-E
env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
FLAGS_cpu_deterministic=true
FLAGS_cpu_deterministic=true
PYTHONPATH=
${
PADDLE_BINARY_DIR
}
/python
${
py_test_ENVS
}
PYTHONPATH=
${
PADDLE_BINARY_DIR
}
/python
${
py_test_ENVS
}
${
PYTHON_EXECUTABLE
}
-u
${
py_test_SRCS
}
${
py_test_ARGS
}
${
PYTHON_EXECUTABLE
}
-u
${
py_test_SRCS
}
${
py_test_ARGS
}
...
...
cmake/hip.cmake
浏览文件 @
7e4bd695
...
@@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU)
...
@@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU)
endif
()
endif
()
include_directories
(
"/opt/rocm/include"
)
include_directories
(
"/opt/rocm/include"
)
include_directories
(
"/opt/rocm/hip/include"
)
include_directories
(
"/opt/rocm/miopen/include"
)
include_directories
(
"/opt/rocm/hipblas/include"
)
include_directories
(
"/opt/rocm/hipblas/include"
)
include_directories
(
"/opt/rocm/hiprand/include"
)
include_directories
(
"/opt/rocm/hiprand/include"
)
include_directories
(
"/opt/rocm/rocrand/include"
)
include_directories
(
"/opt/rocm/rocrand/include"
)
...
@@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust")
...
@@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust")
list
(
APPEND EXTERNAL_LIBS
"-L/opt/rocm/lib/ -lhip_hcc"
)
list
(
APPEND EXTERNAL_LIBS
"-L/opt/rocm/lib/ -lhip_hcc"
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-fPIC -DPADDLE_WITH_HIP -std=c++1
4
"
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-fPIC -DPADDLE_WITH_HIP -std=c++1
1
"
)
if
(
WITH_DSO
)
if
(
WITH_DSO
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_USE_DSO"
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_USE_DSO"
)
endif
(
WITH_DSO
)
endif
(
WITH_DSO
)
if
(
WITH_DOUBLE
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_TYPE_DOUBLE"
)
endif
(
WITH_DOUBLE
)
if
(
WITH_TESTING
)
if
(
WITH_TESTING
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_TESTING"
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_TESTING"
)
endif
(
WITH_TESTING
)
endif
(
WITH_TESTING
)
if
(
WITH_DISTRIBUTE
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_DISTRIBUTE"
)
endif
(
WITH_DISTRIBUTE
)
if
(
WITH_GRPC
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_GRPC"
)
endif
(
WITH_GRPC
)
if
(
NOT WITH_GOLANG
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITHOUT_GOLANG"
)
endif
(
NOT WITH_GOLANG
)
if
(
WITH_MKLDNN
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_MKLDNN"
)
endif
(
WITH_MKLDNN
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DANY_IMPL_ANY_CAST_MOVEABLE"
)
if
(
NOT WITH_RDMA
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_DISABLE_RDMA"
)
endif
(
NOT WITH_RDMA
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
list
(
APPEND HIP_HCC_FLAGS
${
CMAKE_CXX_FLAGS_DEBUG
}
)
list
(
APPEND HIP_HCC_FLAGS
${
CMAKE_CXX_FLAGS_DEBUG
}
)
elseif
(
CMAKE_BUILD_TYPE STREQUAL
"RelWithDebInfo"
)
elseif
(
CMAKE_BUILD_TYPE STREQUAL
"RelWithDebInfo"
)
...
...
cmake/operators.cmake
浏览文件 @
7e4bd695
...
@@ -109,7 +109,8 @@ function(op_library TARGET)
...
@@ -109,7 +109,8 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
# Define operators that don't need pybind here.
foreach
(
manual_pybind_op
"compare_op"
"logical_op"
"nccl_op"
foreach
(
manual_pybind_op
"compare_op"
"logical_op"
"nccl_op"
"tensor_array_read_write_op"
"tensorrt_engine_op"
"conv_fusion_op"
)
"tensor_array_read_write_op"
"tensorrt_engine_op"
"conv_fusion_op"
"fusion_transpose_flatten_concat_op"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
manual_pybind_op
}
"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
manual_pybind_op
}
"
)
set
(
pybind_flag 1
)
set
(
pybind_flag 1
)
endif
()
endif
()
...
...
paddle/fluid/API.spec
浏览文件 @
7e4bd695
...
@@ -26,10 +26,10 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara
...
@@ -26,10 +26,10 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.
ParallelExecutor.
ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.
ParallelExecutor.BuildStrategy.
GradientScaleStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.
ParallelExecutor.BuildStrategy.
ReduceStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None
paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.
ParallelExecutor.
BuildStrategy) -> None
paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
...
@@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
...
@@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', '
name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5
, None))
paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', '
flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False
, None))
paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
...
@@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va
...
@@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va
paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspilerConfig.__init__
paddle.fluid.transpiler.DistributeTranspilerConfig.__init__
paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True))
paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'
], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'
))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'
, 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None
))
paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
7e4bd695
...
@@ -116,8 +116,9 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
...
@@ -116,8 +116,9 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library
(
op_info SRCS op_info.cc DEPS attribute framework_proto
)
cc_library
(
op_info SRCS op_info.cc DEPS attribute framework_proto
)
cc_library
(
shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context
)
cc_library
(
shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context
)
cc_library
(
transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context
)
cc_library
(
operator SRCS operator.cc DEPS op_info device_context tensor scope glog
cc_library
(
operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler
)
shape_inference data_transform lod_tensor profiler
transfer_scope_cache
)
cc_test
(
operator_test SRCS operator_test.cc DEPS operator op_registry device_context
)
cc_test
(
operator_test SRCS operator_test.cc DEPS operator op_registry device_context
)
...
@@ -192,3 +193,6 @@ cc_test(tuple_test SRCS tuple_test.cc )
...
@@ -192,3 +193,6 @@ cc_test(tuple_test SRCS tuple_test.cc )
if
(
NOT WIN32
)
if
(
NOT WIN32
)
cc_test
(
rw_lock_test SRCS rw_lock_test.cc
)
cc_test
(
rw_lock_test SRCS rw_lock_test.cc
)
endif
(
NOT WIN32
)
endif
(
NOT WIN32
)
cc_library
(
dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack
)
cc_test
(
dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog
)
paddle/fluid/framework/details/all_reduce_op_handle.cc
浏览文件 @
7e4bd695
...
@@ -23,7 +23,7 @@ namespace paddle {
...
@@ -23,7 +23,7 @@ namespace paddle {
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
AllReduceOpHandle
::
AllReduceOpHandle
(
ir
::
Node
*
node
,
AllReduceOpHandle
::
AllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
...
@@ -74,7 +74,7 @@ void AllReduceOpHandle::RunImpl() {
...
@@ -74,7 +74,7 @@ void AllReduceOpHandle::RunImpl() {
}
}
if
(
platform
::
is_gpu_place
(
lod_tensors
[
0
]
->
place
()))
{
if
(
platform
::
is_gpu_place
(
lod_tensors
[
0
]
->
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE
(
nccl_ctxs_
,
"nccl_ctxs should not be nullptr."
);
PADDLE_ENFORCE
(
nccl_ctxs_
,
"nccl_ctxs should not be nullptr."
);
int
dtype
=
-
1
;
int
dtype
=
-
1
;
size_t
numel
=
0
;
size_t
numel
=
0
;
...
...
paddle/fluid/framework/details/all_reduce_op_handle.h
浏览文件 @
7e4bd695
...
@@ -20,7 +20,7 @@
...
@@ -20,7 +20,7 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#endif
...
@@ -29,7 +29,7 @@ namespace framework {
...
@@ -29,7 +29,7 @@ namespace framework {
namespace
details
{
namespace
details
{
struct
AllReduceOpHandle
:
public
OpHandleBase
{
struct
AllReduceOpHandle
:
public
OpHandleBase
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
AllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
AllReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
ctxs
);
const
platform
::
NCCLContextMap
*
ctxs
);
...
@@ -49,7 +49,7 @@ struct AllReduceOpHandle : public OpHandleBase {
...
@@ -49,7 +49,7 @@ struct AllReduceOpHandle : public OpHandleBase {
private:
private:
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
platform
::
Place
>
places_
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
#endif
#endif
};
};
...
...
paddle/fluid/framework/details/broadcast_op_handle.cc
浏览文件 @
7e4bd695
...
@@ -82,7 +82,7 @@ void BroadcastOpHandle::BroadcastOneVar(
...
@@ -82,7 +82,7 @@ void BroadcastOpHandle::BroadcastOneVar(
});
});
}
}
}
else
{
}
else
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
VarHandle
*
out_handle
=
nullptr
;
VarHandle
*
out_handle
=
nullptr
;
int
root_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
in_tensor
.
place
()).
device
;
int
root_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
in_tensor
.
place
()).
device
;
std
::
vector
<
std
::
function
<
void
()
>>
broadcast_calls
;
std
::
vector
<
std
::
function
<
void
()
>>
broadcast_calls
;
...
...
paddle/fluid/framework/details/broadcast_op_handle.h
浏览文件 @
7e4bd695
...
@@ -24,7 +24,7 @@
...
@@ -24,7 +24,7 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#endif
...
@@ -34,7 +34,7 @@ namespace details {
...
@@ -34,7 +34,7 @@ namespace details {
struct
BroadcastOpHandle
:
public
OpHandleBase
{
struct
BroadcastOpHandle
:
public
OpHandleBase
{
public:
public:
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
BroadcastOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
BroadcastOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
nccl_ctxs
)
const
platform
::
NCCLContextMap
*
nccl_ctxs
)
...
@@ -68,7 +68,7 @@ struct BroadcastOpHandle : public OpHandleBase {
...
@@ -68,7 +68,7 @@ struct BroadcastOpHandle : public OpHandleBase {
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
platform
::
Place
>
places_
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
#endif
#endif
...
...
paddle/fluid/framework/details/broadcast_op_handle_test.h
浏览文件 @
7e4bd695
...
@@ -42,7 +42,7 @@ struct TestBroadcastOpHandle {
...
@@ -42,7 +42,7 @@ struct TestBroadcastOpHandle {
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
nodes_
;
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
nodes_
;
std
::
vector
<
p
::
Place
>
place_list_
;
std
::
vector
<
p
::
Place
>
place_list_
;
bool
use_gpu_
;
bool
use_gpu_
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
#endif
#endif
...
@@ -50,7 +50,7 @@ struct TestBroadcastOpHandle {
...
@@ -50,7 +50,7 @@ struct TestBroadcastOpHandle {
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
ctxs_
[
j
]
->
Wait
();
ctxs_
[
j
]
->
Wait
();
}
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if
(
nccl_ctxs_
)
{
if
(
nccl_ctxs_
)
{
nccl_ctxs_
->
WaitAll
();
nccl_ctxs_
->
WaitAll
();
}
}
...
@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
...
@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
void
InitCtxOnGpu
(
bool
use_gpu
)
{
void
InitCtxOnGpu
(
bool
use_gpu
)
{
use_gpu_
=
use_gpu
;
use_gpu_
=
use_gpu
;
if
(
use_gpu_
)
{
if
(
use_gpu_
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
int
count
=
p
::
GetCUDADeviceCount
();
int
count
=
p
::
GetCUDADeviceCount
();
if
(
count
<=
1
)
{
if
(
count
<=
1
)
{
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
...
@@ -84,7 +84,7 @@ struct TestBroadcastOpHandle {
...
@@ -84,7 +84,7 @@ struct TestBroadcastOpHandle {
place_list_
.
push_back
(
p
);
place_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
}
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_
.
reset
(
nullptr
);
nccl_ctxs_
.
reset
(
nullptr
);
#endif
#endif
}
}
...
@@ -106,14 +106,14 @@ struct TestBroadcastOpHandle {
...
@@ -106,14 +106,14 @@ struct TestBroadcastOpHandle {
nodes_
.
emplace_back
(
nodes_
.
emplace_back
(
ir
::
CreateNodeForTest
(
"node0"
,
ir
::
Node
::
Type
::
kOperation
));
ir
::
CreateNodeForTest
(
"node0"
,
ir
::
Node
::
Type
::
kOperation
));
if
(
use_gpu_
)
{
if
(
use_gpu_
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_
=
new
BroadcastOpHandle
(
nodes_
.
back
().
get
(),
local_scopes_
,
op_handle_
=
new
BroadcastOpHandle
(
nodes_
.
back
().
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
());
place_list_
,
nccl_ctxs_
.
get
());
#else
#else
PADDLE_THROW
(
"CUDA is not support."
);
PADDLE_THROW
(
"CUDA is not support."
);
#endif
#endif
}
else
{
}
else
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_
=
new
BroadcastOpHandle
(
nodes_
.
back
().
get
(),
local_scopes_
,
op_handle_
=
new
BroadcastOpHandle
(
nodes_
.
back
().
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
());
place_list_
,
nccl_ctxs_
.
get
());
#else
#else
...
...
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
7e4bd695
...
@@ -96,7 +96,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
...
@@ -96,7 +96,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const
std
::
string
&
loss_var_name
,
const
std
::
string
&
loss_var_name
,
const
std
::
unordered_set
<
std
::
string
>
&
param_names
,
const
std
::
unordered_set
<
std
::
string
>
&
param_names
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
bool
use_cuda
,
platform
::
NCCLContextMap
*
nccl_ctxs
)
const
{
const
bool
use_cuda
,
platform
::
NCCLContextMap
*
nccl_ctxs
)
const
{
#else
#else
const
bool
use_cuda
)
const
{
const
bool
use_cuda
)
const
{
...
@@ -118,7 +118,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
...
@@ -118,7 +118,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass
->
Erase
(
"local_scopes"
);
pass
->
Erase
(
"local_scopes"
);
pass
->
SetNotOwned
<
const
std
::
vector
<
Scope
*>>
(
"local_scopes"
,
pass
->
SetNotOwned
<
const
std
::
vector
<
Scope
*>>
(
"local_scopes"
,
&
local_scopes
);
&
local_scopes
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform
::
NCCLContextMap
*
nctx
=
use_cuda
?
nccl_ctxs
:
nullptr
;
platform
::
NCCLContextMap
*
nctx
=
use_cuda
?
nccl_ctxs
:
nullptr
;
pass
->
Erase
(
"nccl_ctxs"
);
pass
->
Erase
(
"nccl_ctxs"
);
pass
->
SetNotOwned
<
platform
::
NCCLContextMap
>
(
"nccl_ctxs"
,
nctx
);
pass
->
SetNotOwned
<
platform
::
NCCLContextMap
>
(
"nccl_ctxs"
,
nctx
);
...
...
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
7e4bd695
...
@@ -23,7 +23,7 @@
...
@@ -23,7 +23,7 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#endif
...
@@ -98,7 +98,7 @@ struct BuildStrategy {
...
@@ -98,7 +98,7 @@ struct BuildStrategy {
const
std
::
string
&
loss_var_name
,
const
std
::
string
&
loss_var_name
,
const
std
::
unordered_set
<
std
::
string
>
&
param_names
,
const
std
::
unordered_set
<
std
::
string
>
&
param_names
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
bool
use_cuda
,
platform
::
NCCLContextMap
*
nccl_ctxs
)
const
;
const
bool
use_cuda
,
platform
::
NCCLContextMap
*
nccl_ctxs
)
const
;
#else
#else
const
bool
use_cuda
)
const
;
const
bool
use_cuda
)
const
;
...
...
paddle/fluid/framework/details/data_balance_op_handle.cc
浏览文件 @
7e4bd695
...
@@ -20,7 +20,7 @@ namespace paddle {
...
@@ -20,7 +20,7 @@ namespace paddle {
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
DataBalanceOpHandle
::
DataBalanceOpHandle
(
DataBalanceOpHandle
::
DataBalanceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
...
...
paddle/fluid/framework/details/data_balance_op_handle.h
浏览文件 @
7e4bd695
...
@@ -19,7 +19,7 @@
...
@@ -19,7 +19,7 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#endif
...
@@ -29,7 +29,7 @@ namespace details {
...
@@ -29,7 +29,7 @@ namespace details {
struct
DataBalanceOpHandle
:
public
OpHandleBase
{
struct
DataBalanceOpHandle
:
public
OpHandleBase
{
public:
public:
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
DataBalanceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
DataBalanceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
ctxs
);
const
platform
::
NCCLContextMap
*
ctxs
);
...
...
paddle/fluid/framework/details/fused_broadcast_op_handle.h
浏览文件 @
7e4bd695
...
@@ -25,7 +25,7 @@
...
@@ -25,7 +25,7 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#endif
...
@@ -35,7 +35,7 @@ namespace details {
...
@@ -35,7 +35,7 @@ namespace details {
struct
FusedBroadcastOpHandle
:
public
BroadcastOpHandle
{
struct
FusedBroadcastOpHandle
:
public
BroadcastOpHandle
{
public:
public:
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
FusedBroadcastOpHandle
(
ir
::
Node
*
node
,
FusedBroadcastOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
local_scopes
,
const
std
::
vector
<
Scope
*>
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
...
...
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
浏览文件 @
7e4bd695
...
@@ -44,14 +44,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
...
@@ -44,14 +44,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
nodes_
.
emplace_back
(
nodes_
.
emplace_back
(
ir
::
CreateNodeForTest
(
"fused_broadcast"
,
ir
::
Node
::
Type
::
kOperation
));
ir
::
CreateNodeForTest
(
"fused_broadcast"
,
ir
::
Node
::
Type
::
kOperation
));
if
(
use_gpu_
)
{
if
(
use_gpu_
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_
=
new
FusedBroadcastOpHandle
(
op_handle_
=
new
FusedBroadcastOpHandle
(
nodes_
.
back
().
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
());
nodes_
.
back
().
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
());
#else
#else
PADDLE_THROW
(
"CUDA is not supported."
);
PADDLE_THROW
(
"CUDA is not supported."
);
#endif
#endif
}
else
{
}
else
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_
=
new
FusedBroadcastOpHandle
(
op_handle_
=
new
FusedBroadcastOpHandle
(
nodes_
.
back
().
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
());
nodes_
.
back
().
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
());
#else
#else
...
...
paddle/fluid/framework/details/multi_devices_graph_pass.cc
浏览文件 @
7e4bd695
...
@@ -142,7 +142,7 @@ void MultiDevSSAGraphBuilder::Init() const {
...
@@ -142,7 +142,7 @@ void MultiDevSSAGraphBuilder::Init() const {
places_
=
Get
<
const
std
::
vector
<
platform
::
Place
>>
(
kPlaces
);
places_
=
Get
<
const
std
::
vector
<
platform
::
Place
>>
(
kPlaces
);
local_scopes_
=
Get
<
const
std
::
vector
<
Scope
*>>
(
kLocalScopes
);
local_scopes_
=
Get
<
const
std
::
vector
<
Scope
*>>
(
kLocalScopes
);
strategy_
=
Get
<
const
BuildStrategy
>
(
kStrategy
);
strategy_
=
Get
<
const
BuildStrategy
>
(
kStrategy
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_
=
&
Get
<
platform
::
NCCLContextMap
>
(
"nccl_ctxs"
);
nccl_ctxs_
=
&
Get
<
platform
::
NCCLContextMap
>
(
"nccl_ctxs"
);
#endif
#endif
...
@@ -431,7 +431,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
...
@@ -431,7 +431,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
}
}
}
}
bool
use_gpu
=
false
;
bool
use_gpu
=
false
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
use_gpu
=
nccl_ctxs_
!=
nullptr
;
use_gpu
=
nccl_ctxs_
!=
nullptr
;
#endif
#endif
...
@@ -478,7 +478,7 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
...
@@ -478,7 +478,7 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
void
MultiDevSSAGraphBuilder
::
SetCommunicationContext
(
void
MultiDevSSAGraphBuilder
::
SetCommunicationContext
(
OpHandleBase
*
op_handle
,
const
platform
::
Place
&
p
)
const
{
OpHandleBase
*
op_handle
,
const
platform
::
Place
&
p
)
const
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if
(
nccl_ctxs_
==
nullptr
)
{
if
(
nccl_ctxs_
==
nullptr
)
{
op_handle
->
SetDeviceContext
(
p
,
op_handle
->
SetDeviceContext
(
p
,
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
));
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
));
...
@@ -492,7 +492,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
...
@@ -492,7 +492,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
void
MultiDevSSAGraphBuilder
::
CreateBroadcastOp
(
ir
::
Graph
*
result
,
void
MultiDevSSAGraphBuilder
::
CreateBroadcastOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
p_name
,
const
std
::
string
&
p_name
,
size_t
src_dev_id
)
const
{
size_t
src_dev_id
)
const
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto
*
op_handle
=
new
BroadcastOpHandle
(
auto
*
op_handle
=
new
BroadcastOpHandle
(
result
->
CreateEmptyNode
(
"broadcast"
,
ir
::
Node
::
Type
::
kOperation
),
result
->
CreateEmptyNode
(
"broadcast"
,
ir
::
Node
::
Type
::
kOperation
),
local_scopes_
,
places_
,
nccl_ctxs_
);
local_scopes_
,
places_
,
nccl_ctxs_
);
...
@@ -522,7 +522,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
...
@@ -522,7 +522,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
void
MultiDevSSAGraphBuilder
::
CreateFusedBroadcastOp
(
void
MultiDevSSAGraphBuilder
::
CreateFusedBroadcastOp
(
ir
::
Graph
*
result
,
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>
&
bcast_varnames
)
const
{
const
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>
&
bcast_varnames
)
const
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto
*
op_handle
=
new
FusedBroadcastOpHandle
(
auto
*
op_handle
=
new
FusedBroadcastOpHandle
(
result
->
CreateEmptyNode
(
"fused_broadcast"
,
ir
::
Node
::
Type
::
kOperation
),
result
->
CreateEmptyNode
(
"fused_broadcast"
,
ir
::
Node
::
Type
::
kOperation
),
local_scopes_
,
places_
,
nccl_ctxs_
);
local_scopes_
,
places_
,
nccl_ctxs_
);
...
@@ -568,7 +568,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
...
@@ -568,7 +568,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
void
MultiDevSSAGraphBuilder
::
InsertAllReduceOp
(
ir
::
Graph
*
result
,
void
MultiDevSSAGraphBuilder
::
InsertAllReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
)
const
{
const
std
::
string
&
og
)
const
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
AllReduceOpHandle
(
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
AllReduceOpHandle
(
result
->
CreateEmptyNode
(
"allreduce"
,
ir
::
Node
::
Type
::
kOperation
),
result
->
CreateEmptyNode
(
"allreduce"
,
ir
::
Node
::
Type
::
kOperation
),
local_scopes_
,
places_
,
nccl_ctxs_
));
local_scopes_
,
places_
,
nccl_ctxs_
));
...
@@ -597,7 +597,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
...
@@ -597,7 +597,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
void
MultiDevSSAGraphBuilder
::
InsertDataBalanceOp
(
void
MultiDevSSAGraphBuilder
::
InsertDataBalanceOp
(
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
string
>
&
datas
)
const
{
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
string
>
&
datas
)
const
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
DataBalanceOpHandle
(
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
DataBalanceOpHandle
(
result
->
CreateEmptyNode
(
"data_balance"
,
ir
::
Node
::
Type
::
kOperation
),
result
->
CreateEmptyNode
(
"data_balance"
,
ir
::
Node
::
Type
::
kOperation
),
local_scopes_
,
places_
,
nccl_ctxs_
));
local_scopes_
,
places_
,
nccl_ctxs_
));
...
@@ -694,7 +694,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
...
@@ -694,7 +694,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
VarHandle
*
MultiDevSSAGraphBuilder
::
CreateReduceOp
(
ir
::
Graph
*
result
,
VarHandle
*
MultiDevSSAGraphBuilder
::
CreateReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
,
const
std
::
string
&
og
,
int
dst_dev_id
)
const
{
int
dst_dev_id
)
const
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
ReduceOpHandle
(
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
ReduceOpHandle
(
result
->
CreateEmptyNode
(
"reduce"
,
ir
::
Node
::
Type
::
kOperation
),
result
->
CreateEmptyNode
(
"reduce"
,
ir
::
Node
::
Type
::
kOperation
),
local_scopes_
,
places_
,
nccl_ctxs_
));
local_scopes_
,
places_
,
nccl_ctxs_
));
...
...
paddle/fluid/framework/details/multi_devices_graph_pass.h
浏览文件 @
7e4bd695
...
@@ -40,7 +40,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
...
@@ -40,7 +40,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
size_t
device_id
)
const
;
size_t
device_id
)
const
;
void
Init
()
const
;
void
Init
()
const
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
mutable
platform
::
NCCLContextMap
*
nccl_ctxs_
;
mutable
platform
::
NCCLContextMap
*
nccl_ctxs_
;
#endif
#endif
...
...
paddle/fluid/framework/details/reduce_op_handle.cc
浏览文件 @
7e4bd695
...
@@ -125,7 +125,7 @@ void ReduceOpHandle::RunImpl() {
...
@@ -125,7 +125,7 @@ void ReduceOpHandle::RunImpl() {
}
}
});
});
}
else
if
(
paddle
::
platform
::
is_gpu_place
(
lod_tensors
[
0
]
->
place
()))
{
}
else
if
(
paddle
::
platform
::
is_gpu_place
(
lod_tensors
[
0
]
->
place
()))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto
pre_in
=
pre_in_var
->
Get
<
framework
::
LoDTensor
>
();
auto
pre_in
=
pre_in_var
->
Get
<
framework
::
LoDTensor
>
();
VariableVisitor
::
ShareDimsAndLoD
(
*
pre_in_var
,
out_var
);
VariableVisitor
::
ShareDimsAndLoD
(
*
pre_in_var
,
out_var
);
VariableVisitor
::
GetMutableTensor
(
out_var
).
mutable_data
(
VariableVisitor
::
GetMutableTensor
(
out_var
).
mutable_data
(
...
...
paddle/fluid/framework/details/reduce_op_handle.h
浏览文件 @
7e4bd695
...
@@ -23,7 +23,7 @@
...
@@ -23,7 +23,7 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#endif
...
@@ -35,7 +35,7 @@ struct ReduceOpHandle : public OpHandleBase {
...
@@ -35,7 +35,7 @@ struct ReduceOpHandle : public OpHandleBase {
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
platform
::
Place
>
places_
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
ReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
ReduceOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
...
...
paddle/fluid/framework/details/reduce_op_handle_test.cc
浏览文件 @
7e4bd695
...
@@ -35,7 +35,7 @@ struct TestReduceOpHandle {
...
@@ -35,7 +35,7 @@ struct TestReduceOpHandle {
std
::
vector
<
p
::
Place
>
gpu_list_
;
std
::
vector
<
p
::
Place
>
gpu_list_
;
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
#endif
#endif
...
@@ -43,7 +43,7 @@ struct TestReduceOpHandle {
...
@@ -43,7 +43,7 @@ struct TestReduceOpHandle {
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
ctxs_
[
j
]
->
Wait
();
ctxs_
[
j
]
->
Wait
();
}
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if
(
nccl_ctxs_
)
{
if
(
nccl_ctxs_
)
{
nccl_ctxs_
->
WaitAll
();
nccl_ctxs_
->
WaitAll
();
}
}
...
@@ -53,7 +53,7 @@ struct TestReduceOpHandle {
...
@@ -53,7 +53,7 @@ struct TestReduceOpHandle {
void
InitCtxOnGpu
(
bool
use_gpu
)
{
void
InitCtxOnGpu
(
bool
use_gpu
)
{
use_gpu_
=
use_gpu
;
use_gpu_
=
use_gpu
;
if
(
use_gpu
)
{
if
(
use_gpu
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
int
count
=
p
::
GetCUDADeviceCount
();
int
count
=
p
::
GetCUDADeviceCount
();
if
(
count
<=
1
)
{
if
(
count
<=
1
)
{
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
...
@@ -77,7 +77,7 @@ struct TestReduceOpHandle {
...
@@ -77,7 +77,7 @@ struct TestReduceOpHandle {
gpu_list_
.
push_back
(
p
);
gpu_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
}
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_
.
reset
(
nullptr
);
nccl_ctxs_
.
reset
(
nullptr
);
#endif
#endif
}
}
...
@@ -99,14 +99,14 @@ struct TestReduceOpHandle {
...
@@ -99,14 +99,14 @@ struct TestReduceOpHandle {
nodes
.
emplace_back
(
new
ir
::
Node
(
"node"
));
nodes
.
emplace_back
(
new
ir
::
Node
(
"node"
));
if
(
use_gpu_
)
{
if
(
use_gpu_
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_
.
reset
(
new
ReduceOpHandle
(
nodes
.
back
().
get
(),
local_scopes_
,
op_handle_
.
reset
(
new
ReduceOpHandle
(
nodes
.
back
().
get
(),
local_scopes_
,
gpu_list_
,
nccl_ctxs_
.
get
()));
gpu_list_
,
nccl_ctxs_
.
get
()));
#else
#else
PADDLE_THROW
(
"CUDA is not support."
);
PADDLE_THROW
(
"CUDA is not support."
);
#endif
#endif
}
else
{
}
else
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_
.
reset
(
new
ReduceOpHandle
(
nodes
.
back
().
get
(),
local_scopes_
,
op_handle_
.
reset
(
new
ReduceOpHandle
(
nodes
.
back
().
get
(),
local_scopes_
,
gpu_list_
,
nccl_ctxs_
.
get
()));
gpu_list_
,
nccl_ctxs_
.
get
()));
#else
#else
...
...
paddle/fluid/framework/dlpack_tensor.cc
0 → 100644
浏览文件 @
7e4bd695
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/dlpack_tensor.h"
namespace
paddle
{
namespace
framework
{
namespace
internal
{
// Maps the C++ element type T to its DLPack data type descriptor: the type
// code (float / unsigned int / signed int), the bit width of T, and one lane.
// Any other kind of type is rejected through PADDLE_THROW.
template <typename T>
static ::DLDataType GetDLDataTypeCode() {
  // platform::float16 is tested explicitly in addition to
  // std::is_floating_point.
  const bool is_float = std::is_same<T, platform::float16>::value ||
                        std::is_floating_point<T>::value;

  ::DLDataType result;
  result.lanes = 1;
  result.bits = 8 * sizeof(T);

  if (is_float) {
    result.code = kDLFloat;
  } else if (std::is_unsigned<T>::value) {
    // Tested before is_integral so that unsigned integral types get the
    // unsigned code rather than the signed one.
    result.code = kDLUInt;
  } else if (std::is_integral<T>::value) {
    result.code = kDLInt;
  } else {
    PADDLE_THROW("Unsupported data type %s", typeid(T).name());
  }
  return result;
}
// Returns the DLPack data type registered for the runtime type `type`.
// A PADDLE_ENFORCE check fails when `type` is not one of the entries listed
// in the table below.
static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) {
// Expands to one {std::type_index, ::DLDataType} table entry for `cpp_type`.
#define REG_DL_DATA_TYPE(cpp_type) \
  { std::type_index(typeid(cpp_type)), GetDLDataTypeCode<cpp_type>() }

  // Function-local static: the table is constructed once and then reused.
  static const std::unordered_map<std::type_index, ::DLDataType> dtype_table({
      REG_DL_DATA_TYPE(platform::float16),  // NOLINT
      REG_DL_DATA_TYPE(float),              // NOLINT
      REG_DL_DATA_TYPE(double),             // NOLINT
      REG_DL_DATA_TYPE(int),                // NOLINT
      REG_DL_DATA_TYPE(int64_t),            // NOLINT
      REG_DL_DATA_TYPE(bool),               // NOLINT
      REG_DL_DATA_TYPE(size_t),             // NOLINT
      REG_DL_DATA_TYPE(int16_t),            // NOLINT
      REG_DL_DATA_TYPE(uint8_t),            // NOLINT
      REG_DL_DATA_TYPE(int8_t)              // NOLINT
  });

#undef REG_DL_DATA_TYPE

  const auto found = dtype_table.find(type);
  PADDLE_ENFORCE(found != dtype_table.end(), "Unsupported data type %s",
                 type.name());
  return found->second;
}
// boost::variant visitor that converts a paddle place into the matching
// DLPack device context (device type + device id).
struct DLContextVisitor : public boost::static_visitor<::DLContext> {
  // CPU memory: device type kDLCPU, device id 0.
  inline ::DLContext operator()(const platform::CPUPlace &place) const {
    ::DLContext cpu_ctx;
    cpu_ctx.device_id = 0;
    cpu_ctx.device_type = kDLCPU;
    return cpu_ctx;
  }

  // CUDA device memory: device type kDLGPU, device id taken from the place.
  // Throws when the build does not define PADDLE_WITH_CUDA.
  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
    ::DLContext gpu_ctx;
    gpu_ctx.device_id = place.device;
    gpu_ctx.device_type = kDLGPU;
    return gpu_ctx;
#else
    PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version");
#endif
  }

  // CUDA pinned host memory: device type kDLCPUPinned, device id 0.
  // Throws when the build does not define PADDLE_WITH_CUDA.
  inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
#ifdef PADDLE_WITH_CUDA
    ::DLContext pinned_ctx;
    pinned_ctx.device_id = 0;
    pinned_ctx.device_type = kDLCPUPinned;
    return pinned_ctx;
#else
    PADDLE_THROW(
        "platform::CUDAPinnedPlace is not supported in CPU only version");
#endif
  }
};
}
// namespace internal
// Fills the wrapped ::DLTensor so that it describes `tensor`.
//
// The data pointer of `tensor` is referenced, not copied. The shape is copied
// into the object's own fixed-size `shape_` buffer, and `t_.shape` is pointed
// at that buffer.
//
// Args:
//   tensor: source tensor; its place and element type must be ones handled by
//           internal::DLContextVisitor / GetDLDataTypeFromTypeIndex.
//   lanes:  value stored into dtype.lanes.
//
// Fails a PADDLE_ENFORCE check when the rank of `tensor` does not fit into
// `shape_`, instead of writing past the end of the buffer.
DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) {
  // init data, data buffer
  t_.data = const_cast<void *>(tensor.data<void>());

  // init ctx, DLContext type with device_type and device_id
  auto place = tensor.place();
  t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place);

  // init dtype
  t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type());
  t_.dtype.lanes = lanes;

  // init ndim, tensor rank
  auto &dims = tensor.dims();
  using DimType = decltype(t_.ndim);  // int

  // shape_ is a fixed-size member array; reject ranks that would overflow it.
  const size_t shape_capacity = sizeof(shape_) / sizeof(shape_[0]);
  PADDLE_ENFORCE(static_cast<size_t>(dims.size()) <= shape_capacity,
                 "Tensor rank %d exceeds the maximum rank %d supported by "
                 "DLPackTensor",
                 static_cast<int>(dims.size()),
                 static_cast<int>(shape_capacity));
  t_.ndim = static_cast<DimType>(dims.size());

  // init shape, tensor dims
  t_.shape = shape_;
  for (DimType i = 0; i < t_.ndim; ++i) {
    t_.shape[i] = dims[i];
  }

  // init strides, nullptr means the tensor is compact
  t_.strides = nullptr;

  // init byte_offset
  t_.byte_offset = 0;
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/dlpack_tensor.h
0 → 100644
浏览文件 @
7e4bd695
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <dlpack/dlpack.h>
#include "paddle/fluid/framework/tensor.h"
namespace
paddle
{
namespace
framework
{
class
DLPackTensor
{
public:
using
LaneType
=
decltype
(
::
DLTensor
::
dtype
.
lanes
);
// uint16_t
using
ShapeType
=
std
::
remove_reference
<
decltype
(
::
DLTensor
::
shape
[
0
])
>::
type
;
// int64_t
// lanes is only used in CPU to enable vectorization
explicit
DLPackTensor
(
const
Tensor
&
tensor
,
LaneType
lanes
=
1
);
inline
operator
const
::
DLTensor
&
()
const
{
return
t_
;
}
inline
operator
::
DLTensor
&
()
{
return
t_
;
}
private:
::
DLTensor
t_
;
// The shape in DLTensor is defined as int64_t*
// Add this member to make TVMTensor init without heap allocation
ShapeType
shape_
[
9
];
};
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/dlpack_tensor_test.cc
0 → 100644
浏览文件 @
7e4bd695
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/dlpack_tensor.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <vector>
namespace
paddle
{
namespace
framework
{
namespace {  // NOLINT
// Test-side computation of the DLPack type code expected for element type T:
// kDLFloat for platform::float16 and floating-point types, kDLUInt for
// unsigned types, kDLInt for other integral types, and uint8_t(-1) otherwise.
template <typename T>
constexpr uint8_t GetDLDataTypeCode() {
  // Kept as a single return expression. The conditional operator is
  // right-associative, so this chain is evaluated like if / else-if / else.
  return (std::is_same<platform::float16, T>::value ||
          std::is_floating_point<T>::value)
             ? static_cast<uint8_t>(kDLFloat)
             : std::is_unsigned<T>::value
                   ? static_cast<uint8_t>(kDLUInt)
                   : std::is_integral<T>::value
                         ? static_cast<uint8_t>(kDLInt)
                         : static_cast<uint8_t>(-1);
}
}  // NOLINT
// Allocates a 4x5x6x7 tensor of T on `place`, wraps it in a DLPackTensor with
// the given `lanes`, and checks every field of the resulting ::DLTensor:
// data pointer, device context, rank and shape, strides, byte offset and
// data type (lanes, bits, code).
template <typename T>
void TestMain(const platform::Place &place, uint16_t lanes) {
  DDim dims{4, 5, 6, 7};
  Tensor tensor;
  tensor.Resize(dims);
  void *p = tensor.mutable_data<T>(place);

  DLPackTensor dlpack_tensor(tensor, lanes);
  // Exercises the implicit conversion to a mutable ::DLTensor reference.
  ::DLTensor &dl_tensor = dlpack_tensor;

  // The wrapper must reference the tensor's buffer.
  CHECK_EQ(p, dl_tensor.data);

  // Device context must match the place the tensor was allocated on.
  if (platform::is_cpu_place(place)) {
    CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type);
    CHECK_EQ(0, dl_tensor.ctx.device_id);
  } else if (platform::is_gpu_place(place)) {
    CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type);
    CHECK_EQ(boost::get<platform::CUDAPlace>(place).device,
             dl_tensor.ctx.device_id);
  } else if (platform::is_cuda_pinned_place(place)) {
    CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type);
    CHECK_EQ(0, dl_tensor.ctx.device_id);
  } else {
    // Any other place kind is a test failure.
    CHECK_EQ(false, true);
  }

  // Rank and per-dimension extents.
  CHECK_EQ(dims.size(), dl_tensor.ndim);
  for (auto i = 0; i < dims.size(); ++i) {
    CHECK_EQ(dims[i], dl_tensor.shape[i]);
  }

  // Layout fields: no strides and no byte offset.
  CHECK_EQ(dl_tensor.strides == nullptr, true);
  CHECK_EQ(static_cast<uint64_t>(0), dl_tensor.byte_offset);

  // Data type descriptor.
  CHECK_EQ(lanes, dl_tensor.dtype.lanes);
  CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits);
  CHECK_EQ(GetDLDataTypeCode<T>(), dl_tensor.dtype.code);
}
// Runs TestMain<T> for every combination of place and lane count.
// Places: CPU always; with PADDLE_WITH_CUDA also CUDA device 0, CUDA pinned
// memory, and CUDA device 1 when more than one device is reported.
// Lane counts: 1 and 2.
template <typename T>
void TestMainLoop() {
  std::vector<platform::Place> places{platform::CPUPlace()};
#ifdef PADDLE_WITH_CUDA
  places.emplace_back(platform::CUDAPlace(0));
  places.emplace_back(platform::CUDAPinnedPlace());
  if (platform::GetCUDADeviceCount() > 1) {
    places.emplace_back(platform::CUDAPlace(1));
  }
#endif

  const std::vector<uint16_t> lanes{1, 2};
  for (const auto &current_place : places) {
    for (const auto lane_count : lanes) {
      TestMain<T>(current_place, lane_count);
    }
  }
}
// Defines one gtest case in suite `dlpack`, named test_<type>, that runs
// TestMainLoop for that element type.
#define PADDLE_DLPACK_TEST(type) \
  TEST(dlpack, test_##type) { TestMainLoop<type>(); }

// Unqualified alias: the macro pastes its argument into the test name, so the
// type has to be spelled as a single identifier.
using float16 = platform::float16;

PADDLE_DLPACK_TEST(float16);
PADDLE_DLPACK_TEST(float);
PADDLE_DLPACK_TEST(double);
PADDLE_DLPACK_TEST(int);
PADDLE_DLPACK_TEST(int64_t);
PADDLE_DLPACK_TEST(bool);
PADDLE_DLPACK_TEST(size_t);
PADDLE_DLPACK_TEST(int16_t);
PADDLE_DLPACK_TEST(uint8_t);
PADDLE_DLPACK_TEST(int8_t);

#undef PADDLE_DLPACK_TEST
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/executor.cc
浏览文件 @
7e4bd695
...
@@ -20,6 +20,7 @@ limitations under the License. */
...
@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
...
@@ -391,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
...
@@ -391,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
int64_t
max_memory_size
=
GetEagerDeletionThreshold
();
int64_t
max_memory_size
=
GetEagerDeletionThreshold
();
std
::
unique_ptr
<
GarbageCollector
<
Tensor
>>
gc
;
std
::
unique_ptr
<
GarbageCollector
<
Tensor
>>
gc
;
// WhileOp would set keep_kids to
false
// WhileOp would set keep_kids to
true,
//
WhileGradOp would need the scopes created in WhileOp
//
because WhileGradOp needs the scopes created in WhileOp.
// Perhaps, we should not perform eager deletion in WhileOp
// Perhaps, we should not perform eager deletion in WhileOp
// The scopes and variables created by WhileOp would be deleted
// The scopes and variables created by WhileOp would be deleted
// in WhileGradOp.
// in WhileGradOp.
...
...
paddle/fluid/framework/ir/is_test_pass_tester.cc
浏览文件 @
7e4bd695
...
@@ -15,7 +15,10 @@
...
@@ -15,7 +15,10 @@
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <gtest/gtest.h>
#include <gtest/gtest.h>
#ifdef _WIN32
#undef FALSE
#undef TRUE
#endif
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
ir
{
namespace
ir
{
...
...
paddle/fluid/framework/lod_tensor.cc
浏览文件 @
7e4bd695
...
@@ -26,10 +26,8 @@ limitations under the License. */
...
@@ -26,10 +26,8 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/memory/memory.h"
#if !defined(_WIN32)
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
#include "paddle/fluid/recordio/writer.h"
#endif // _WIN32
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
...
@@ -305,7 +303,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
...
@@ -305,7 +303,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
TensorFromStream
(
is
,
static_cast
<
Tensor
*>
(
tensor
),
dev_ctx
);
TensorFromStream
(
is
,
static_cast
<
Tensor
*>
(
tensor
),
dev_ctx
);
}
}
#if !defined(_WIN32)
void
WriteToRecordIO
(
recordio
::
Writer
*
writer
,
void
WriteToRecordIO
(
recordio
::
Writer
*
writer
,
const
std
::
vector
<
LoDTensor
>
&
tensor
,
const
std
::
vector
<
LoDTensor
>
&
tensor
,
const
platform
::
DeviceContext
&
dev_ctx
)
{
const
platform
::
DeviceContext
&
dev_ctx
)
{
...
@@ -335,19 +332,7 @@ bool ReadFromRecordIO(recordio::Scanner *scanner,
...
@@ -335,19 +332,7 @@ bool ReadFromRecordIO(recordio::Scanner *scanner,
return
true
;
return
true
;
}
}
#else
class
Writer
{};
class
Scanner
{};
void
WriteToRecordIO
(
recordio
::
Writer
*
writer
,
const
std
::
vector
<
LoDTensor
>
&
tensor
,
const
platform
::
DeviceContext
&
dev_ctx
)
{}
bool
ReadFromRecordIO
(
recordio
::
Scanner
*
scanner
,
const
platform
::
DeviceContext
&
dev_ctx
,
std
::
vector
<
LoDTensor
>
*
result_ptr
)
{
PADDLE_ENFORCE
(
"windows didn't supported recordio!."
);
return
true
;
}
#endif // _WIN32
std
::
vector
<
LoDTensor
>
LoDTensor
::
SplitLoDTensor
(
std
::
vector
<
LoDTensor
>
LoDTensor
::
SplitLoDTensor
(
const
std
::
vector
<
platform
::
Place
>
places
)
const
{
const
std
::
vector
<
platform
::
Place
>
places
)
const
{
check_memory_size
();
check_memory_size
();
...
...
paddle/fluid/framework/lod_tensor_test.cc
浏览文件 @
7e4bd695
...
@@ -274,7 +274,6 @@ TEST(LoD, ConvertToOffsetBasedLoD) {
...
@@ -274,7 +274,6 @@ TEST(LoD, ConvertToOffsetBasedLoD) {
EXPECT_EQ
(
offset_lod
,
expected
);
EXPECT_EQ
(
offset_lod
,
expected
);
}
}
#if !defined(_WIN32)
template
<
typename
T
>
template
<
typename
T
>
static
void
TestRecordIO
()
{
static
void
TestRecordIO
()
{
LoDTensor
tensor
;
LoDTensor
tensor
;
...
@@ -321,7 +320,6 @@ TEST(LoDTensor, RecordIO) {
...
@@ -321,7 +320,6 @@ TEST(LoDTensor, RecordIO) {
TestRecordIO
<
float
>
();
TestRecordIO
<
float
>
();
TestRecordIO
<
double
>
();
TestRecordIO
<
double
>
();
}
}
#endif // !defined(_WIN32)
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/naive_executor.cc
浏览文件 @
7e4bd695
...
@@ -83,6 +83,7 @@ void NaiveExecutor::Run() {
...
@@ -83,6 +83,7 @@ void NaiveExecutor::Run() {
for
(
auto
&
op
:
ops_
)
{
for
(
auto
&
op
:
ops_
)
{
VLOG
(
3
)
<<
std
::
this_thread
::
get_id
()
<<
" run "
<<
op
->
Type
()
VLOG
(
3
)
<<
std
::
this_thread
::
get_id
()
<<
" run "
<<
op
->
Type
()
<<
" on scope "
<<
scope_
;
<<
" on scope "
<<
scope_
;
op
->
SetIsCalledByExecutor
(
false
);
op
->
Run
(
*
scope_
,
place_
);
op
->
Run
(
*
scope_
,
place_
);
}
}
}
}
...
...
paddle/fluid/framework/op_desc.cc
浏览文件 @
7e4bd695
...
@@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
...
@@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
this
->
attrs_
[
name
]
=
std
::
vector
<
int
>
();
this
->
attrs_
[
name
]
=
std
::
vector
<
int
>
();
break
;
break
;
}
}
case
proto
::
AttrType
::
LONGS
:
{
VLOG
(
110
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from LONGS to LONGS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
int64_t
>
();
break
;
}
case
proto
::
AttrType
::
FLOATS
:
{
case
proto
::
AttrType
::
FLOATS
:
{
VLOG
(
110
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
VLOG
(
110
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to FLOATS"
;
<<
" from INTS to FLOATS"
;
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
7e4bd695
...
@@ -22,6 +22,7 @@ limitations under the License. */
...
@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
...
@@ -33,11 +34,6 @@ DEFINE_bool(check_nan_inf, false,
...
@@ -33,11 +34,6 @@ DEFINE_bool(check_nan_inf, false,
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
// Combine two hash values to a single hash.
inline
size_t
CombineHash
(
size_t
seed
,
size_t
a
)
{
return
(
seed
^
a
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
std
::
vector
<
std
::
tuple
<
platform
::
Place
,
LibraryType
>>
kKernelPriority
=
{
std
::
vector
<
std
::
tuple
<
platform
::
Place
,
LibraryType
>>
kKernelPriority
=
{
std
::
make_tuple
(
platform
::
CUDAPlace
(
0
),
LibraryType
::
kCUDNN
),
std
::
make_tuple
(
platform
::
CUDAPlace
(
0
),
LibraryType
::
kCUDNN
),
std
::
make_tuple
(
platform
::
CUDAPlace
(
0
),
LibraryType
::
kPlain
),
std
::
make_tuple
(
platform
::
CUDAPlace
(
0
),
LibraryType
::
kPlain
),
...
@@ -153,17 +149,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
...
@@ -153,17 +149,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#endif
#endif
}
}
// The profile has a process-wide mutex, results in serious performance issue
// The profile has a process-wide mutex, results in serious performance issue
// in concurrency scenerio. Here use an `if` to fix this issue.
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
// Please not remove the `if`, ask @Superjomn if there are any concern.
#ifndef _WIN32
if
(
platform
::
IsProfileEnabled
())
{
if
(
platform
::
IsProfileEnabled
())
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
RecordEvent
record_event
(
Type
(),
pool
.
Get
(
place
));
platform
::
RecordEvent
record_event
(
Type
(),
pool
.
Get
(
place
));
RunImpl
(
scope
,
place
);
RunImpl
(
scope
,
place
);
}
else
}
else
{
#endif
{
RunImpl
(
scope
,
place
);
RunImpl
(
scope
,
place
);
}
}
VLOG
(
30
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
VLOG
(
30
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
...
@@ -797,17 +790,6 @@ void OperatorWithKernel::TransferInplaceVarsBack(
...
@@ -797,17 +790,6 @@ void OperatorWithKernel::TransferInplaceVarsBack(
Scope
*
OperatorWithKernel
::
TryTransferData
(
Scope
*
OperatorWithKernel
::
TryTransferData
(
const
Scope
&
scope
,
const
OpKernelType
&
expected_kernel_key
,
const
Scope
&
scope
,
const
OpKernelType
&
expected_kernel_key
,
std
::
vector
<
std
::
string
>*
transfered_inplace_vars
)
const
{
std
::
vector
<
std
::
string
>*
transfered_inplace_vars
)
const
{
// In the inference scenerio, the scopes will be reused across the batches, so
// the `new_scope` here will result in GPU memroy explosion over the running of
// operators.
// We use a thread_local cache to fix that issue, the key in the cache is the
// combination of the `scope` argument, from_kernel_type, target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if some changes
// on this logic for this macro might not tested on the other scenerios.
#ifdef PADDLE_ON_INFERENCE
thread_local
std
::
unordered_map
<
size_t
,
Scope
*>
infer_transfer_scope_cache
;
#endif
Scope
*
new_scope
=
nullptr
;
Scope
*
new_scope
=
nullptr
;
for
(
auto
&
var_name_item
:
Inputs
())
{
for
(
auto
&
var_name_item
:
Inputs
())
{
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
...
@@ -838,23 +820,23 @@ Scope* OperatorWithKernel::TryTransferData(
...
@@ -838,23 +820,23 @@ Scope* OperatorWithKernel::TryTransferData(
VLOG
(
30
)
<<
"Transform Variable "
<<
var_name
<<
" from "
VLOG
(
30
)
<<
"Transform Variable "
<<
var_name
<<
" from "
<<
kernel_type_for_var
<<
" to "
<<
expected_kernel_key
;
<<
kernel_type_for_var
<<
" to "
<<
expected_kernel_key
;
#ifdef PADDLE_ON_INFERENCE
// In the inference scenerio, the scopes will be reused across the
size_t
infer_cache_key
=
// batches, so the `new_scope` here will result in GPU memroy explosion
CombineHash
(
OpKernelType
::
Hash
()(
kernel_type_for_var
),
// over the running of operators.
OpKernelType
::
Hash
()(
expected_kernel_key
));
// We use a thread_local cache to fix that issue, the key in the cache is
infer_cache_key
=
// the combination of the `scope` argument, from_kernel_type,
CombineHash
(
infer_cache_key
,
std
::
hash
<
const
Scope
*>
()(
&
scope
));
// target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if some
auto
it
=
infer_transfer_scope_cache
.
find
(
infer_cache_key
);
// changes on this logic for this macro might not tested on the other
if
(
it
!=
infer_transfer_scope_cache
.
end
())
{
// scenerios.
new_scope
=
infer_transfer_scope_cache
[
infer_cache_key
];
// If this op is not called by an Executor or ParallelExecutor, it should
}
else
{
// called by a NaiveExecutor, the NaiveExecutor will cache the scopes and
new_scope
=
&
scope
.
NewScope
();
// variables, that behavior a lot different.
infer_transfer_scope_cache
[
infer_cache_key
]
=
new_scope
;
if
(
!
run_by_executor_
)
{
}
new_scope
=
TryCreateTransferScope
(
kernel_type_for_var
,
#endif
expected_kernel_key
,
&
scope
);
}
if
(
new_scope
==
nullptr
)
{
if
(
!
new_scope
)
{
new_scope
=
&
scope
.
NewScope
();
new_scope
=
&
scope
.
NewScope
();
}
}
...
...
paddle/fluid/framework/operator.h
浏览文件 @
7e4bd695
...
@@ -127,6 +127,8 @@ class OperatorBase {
...
@@ -127,6 +127,8 @@ class OperatorBase {
//! Get all outputs variable names
//! Get all outputs variable names
virtual
std
::
vector
<
std
::
string
>
OutputVars
(
bool
has_intermediate
)
const
;
virtual
std
::
vector
<
std
::
string
>
OutputVars
(
bool
has_intermediate
)
const
;
void
SetIsCalledByExecutor
(
bool
x
)
{
run_by_executor_
=
x
;
}
protected:
protected:
std
::
string
type_
;
std
::
string
type_
;
// NOTE: in case of OpGrad, inputs_ contains:
// NOTE: in case of OpGrad, inputs_ contains:
...
@@ -139,6 +141,8 @@ class OperatorBase {
...
@@ -139,6 +141,8 @@ class OperatorBase {
// IG (Inputs Gradients)
// IG (Inputs Gradients)
VariableNameMap
outputs_
;
VariableNameMap
outputs_
;
AttributeMap
attrs_
;
AttributeMap
attrs_
;
// Whether this operator executes in an Executor.
bool
run_by_executor_
{
true
};
private:
private:
void
GenerateTemporaryNames
();
void
GenerateTemporaryNames
();
...
...
paddle/fluid/framework/transfer_scope_cache.cc
0 → 100644
浏览文件 @
7e4bd695
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/transfer_scope_cache.h"
namespace
paddle
{
namespace
framework
{
// Holds all the transfer scope across the process.
std
::
unordered_map
<
size_t
,
Scope
*>&
global_transfer_data_cache
()
{
typedef
std
::
unordered_map
<
size_t
,
Scope
*>
map_t
;
thread_local
std
::
unique_ptr
<
map_t
>
x
(
new
map_t
);
return
*
x
;
}
// Holds all the transfer scope for this thread.
std
::
unordered_set
<
Scope
*>&
global_transfer_scope_cache
()
{
typedef
std
::
unordered_set
<
Scope
*>
set_t
;
thread_local
std
::
unique_ptr
<
set_t
>
x
(
new
set_t
);
return
*
x
;
}
// Try to create a transfer scope. If one cached scope has match the
// requirement, just return that one.
// Inputs:
// @type0: the source kernel type.
// @type1: the target kernel type.
// @scope: the execution scope of this op.
// Returns: A scope used to hold the transfer data across the different kernel
// type.
Scope
*
TryCreateTransferScope
(
OpKernelType
type0
,
OpKernelType
type1
,
const
Scope
*
scope
)
{
Scope
*
new_scope
{
nullptr
};
size_t
infer_cache_key
=
CombineHash
(
OpKernelType
::
Hash
()(
type0
),
OpKernelType
::
Hash
()(
type1
));
infer_cache_key
=
CombineHash
(
infer_cache_key
,
std
::
hash
<
const
Scope
*>
()(
scope
));
auto
it
=
global_transfer_data_cache
().
find
(
infer_cache_key
);
if
(
it
!=
global_transfer_data_cache
().
end
())
{
new_scope
=
global_transfer_data_cache
()[
infer_cache_key
];
}
else
{
new_scope
=
&
scope
->
NewScope
();
global_transfer_data_cache
()[
infer_cache_key
]
=
new_scope
;
}
global_transfer_scope_cache
().
insert
(
new_scope
);
return
new_scope
;
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/transfer_scope_cache.h
0 → 100644
浏览文件 @
7e4bd695
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <thread> // NOLINT
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/scope.h"
namespace
paddle
{
namespace
framework
{
std
::
unordered_map
<
size_t
,
Scope
*>&
global_transfer_data_cache
();
std
::
unordered_set
<
Scope
*>&
global_transfer_scope_cache
();
// Combine two hash values to a single hash.
static
size_t
CombineHash
(
size_t
seed
,
size_t
a
)
{
return
(
seed
^
a
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
Scope
*
TryCreateTransferScope
(
OpKernelType
type0
,
OpKernelType
type1
,
const
Scope
*
scope
);
void
RemoveKidsFromTransferScopeCache
(
Scope
*
scope
);
}
// namespace framework
}
// namespace paddle
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
7e4bd695
...
@@ -4,6 +4,7 @@ endif()
...
@@ -4,6 +4,7 @@ endif()
# analysis and tensorrt must be added before creating static library,
# analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
# otherwise, there would be undefined reference to them in static library.
add_subdirectory
(
analysis
)
add_subdirectory
(
analysis
)
add_subdirectory
(
utils
)
if
(
TENSORRT_FOUND
)
if
(
TENSORRT_FOUND
)
add_subdirectory
(
tensorrt
)
add_subdirectory
(
tensorrt
)
endif
()
endif
()
...
...
paddle/fluid/inference/analysis/analyzer_tester.cc
浏览文件 @
7e4bd695
...
@@ -19,6 +19,7 @@
...
@@ -19,6 +19,7 @@
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/port.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
...
@@ -75,7 +76,7 @@ void TestWord2vecPrediction(const std::string& model_path) {
...
@@ -75,7 +76,7 @@ void TestWord2vecPrediction(const std::string& model_path) {
0.000932706
};
0.000932706
};
const
size_t
num_elements
=
outputs
.
front
().
data
.
length
()
/
sizeof
(
float
);
const
size_t
num_elements
=
outputs
.
front
().
data
.
length
()
/
sizeof
(
float
);
// The outputs' buffers are in CPU memory.
// The outputs' buffers are in CPU memory.
for
(
size_t
i
=
0
;
i
<
std
::
min
(
5UL
,
num_elements
);
i
++
)
{
for
(
size_t
i
=
0
;
i
<
std
::
min
(
(
size_t
)
5UL
,
num_elements
);
i
++
)
{
LOG
(
INFO
)
<<
"data: "
LOG
(
INFO
)
<<
"data: "
<<
static_cast
<
float
*>
(
outputs
.
front
().
data
.
data
())[
i
];
<<
static_cast
<
float
*>
(
outputs
.
front
().
data
.
data
())[
i
];
PADDLE_ENFORCE
(
static_cast
<
float
*>
(
outputs
.
front
().
data
.
data
())[
i
],
PADDLE_ENFORCE
(
static_cast
<
float
*>
(
outputs
.
front
().
data
.
data
())[
i
],
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
7e4bd695
...
@@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
...
@@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager
)
cc_library
(
zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce
)
cc_library
(
zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce
)
cc_library
(
zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc
)
cc_library
(
zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array
)
cc_test
(
test_paddle_inference_api
cc_test
(
test_paddle_inference_api
SRCS api_tester.cc
SRCS api_tester.cc
...
...
paddle/fluid/inference/api/analysis_config.cc
浏览文件 @
7e4bd695
...
@@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
...
@@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
prog_file
=
other
.
prog_file
;
prog_file
=
other
.
prog_file
;
param_file
=
other
.
param_file
;
param_file
=
other
.
param_file
;
specify_input_name
=
other
.
specify_input_name
;
specify_input_name
=
other
.
specify_input_name
;
cpu_math_library_num_threads_
=
other
.
cpu_math_library_num_threads_
;
// fields from this.
// fields from this.
enable_ir_optim
=
other
.
enable_ir_optim
;
enable_ir_optim
=
other
.
enable_ir_optim
;
use_feed_fetch_ops
=
other
.
use_feed_fetch_ops
;
use_feed_fetch_ops
=
other
.
use_feed_fetch_ops
;
...
@@ -72,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
...
@@ -72,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
prog_file
=
other
.
prog_file
;
prog_file
=
other
.
prog_file
;
param_file
=
other
.
param_file
;
param_file
=
other
.
param_file
;
specify_input_name
=
other
.
specify_input_name
;
specify_input_name
=
other
.
specify_input_name
;
cpu_math_library_num_threads_
=
other
.
cpu_math_library_num_threads_
;
// fields from this.
// fields from this.
enable_ir_optim
=
other
.
enable_ir_optim
;
enable_ir_optim
=
other
.
enable_ir_optim
;
use_feed_fetch_ops
=
other
.
use_feed_fetch_ops
;
use_feed_fetch_ops
=
other
.
use_feed_fetch_ops
;
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
7e4bd695
...
@@ -31,11 +31,11 @@
...
@@ -31,11 +31,11 @@
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
#endif
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
profile
);
DECLARE_bool
(
profile
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
paddle
{
...
@@ -56,7 +56,6 @@ bool AnalysisPredictor::Init(
...
@@ -56,7 +56,6 @@ bool AnalysisPredictor::Init(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
,
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
,
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
)
{
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
)
{
VLOG
(
30
)
<<
"Predictor::init()"
;
VLOG
(
30
)
<<
"Predictor::init()"
;
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
if
(
FLAGS_profile
)
{
LOG
(
WARNING
)
<<
"Profiler is actived, might affect the performance"
;
LOG
(
WARNING
)
<<
"Profiler is actived, might affect the performance"
;
LOG
(
INFO
)
<<
"You can turn off by set gflags '-profile false'"
;
LOG
(
INFO
)
<<
"You can turn off by set gflags '-profile false'"
;
...
@@ -64,10 +63,9 @@ bool AnalysisPredictor::Init(
...
@@ -64,10 +63,9 @@ bool AnalysisPredictor::Init(
:
platform
::
ProfilerState
::
kCPU
;
:
platform
::
ProfilerState
::
kCPU
;
platform
::
EnableProfiler
(
tracking_device
);
platform
::
EnableProfiler
(
tracking_device
);
}
}
#endif
// no matter with or without MKLDNN
// no matter with or without MKLDNN
paddle
::
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
paddle
::
platform
::
SetNumThreads
(
config_
.
cpu_math_library_num_threads
()
);
if
(
!
PrepareScope
(
parent_scope
))
{
if
(
!
PrepareScope
(
parent_scope
))
{
return
false
;
return
false
;
...
@@ -160,6 +158,14 @@ bool AnalysisPredictor::PrepareExecutor() {
...
@@ -160,6 +158,14 @@ bool AnalysisPredictor::PrepareExecutor() {
return
true
;
return
true
;
}
}
void
AnalysisPredictor
::
SetMkldnnThreadID
(
int
tid
)
{
#ifdef PADDLE_WITH_MKLDNN
platform
::
set_cur_thread_id
(
tid
);
#else
LOG
(
ERROR
)
<<
"Please compile with MKLDNN first to use MKLDNN"
;
#endif
}
bool
AnalysisPredictor
::
Run
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
bool
AnalysisPredictor
::
Run
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
output_data
,
std
::
vector
<
PaddleTensor
>
*
output_data
,
int
batch_size
)
{
int
batch_size
)
{
...
@@ -167,7 +173,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
...
@@ -167,7 +173,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
inference
::
Timer
timer
;
inference
::
Timer
timer
;
timer
.
tic
();
timer
.
tic
();
// set feed variable
// set feed variable
std
::
vector
<
framework
::
LoDTensor
>
feeds
;
framework
::
Scope
*
scope
=
sub_scope_
?
sub_scope_
:
scope_
.
get
();
framework
::
Scope
*
scope
=
sub_scope_
?
sub_scope_
:
scope_
.
get
();
if
(
!
SetFeed
(
inputs
,
scope
))
{
if
(
!
SetFeed
(
inputs
,
scope
))
{
LOG
(
ERROR
)
<<
"fail to set feed"
;
LOG
(
ERROR
)
<<
"fail to set feed"
;
...
@@ -208,17 +213,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
...
@@ -208,17 +213,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework
::
DDim
ddim
=
framework
::
make_ddim
(
inputs
[
i
].
shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
inputs
[
i
].
shape
);
void
*
input_ptr
;
void
*
input_ptr
;
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
INT64
)
{
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
INT64
)
{
input_ptr
=
input
.
mutable_data
<
int64_t
>
(
ddim
,
pla
tform
::
CPUPlace
()
);
input_ptr
=
input
.
mutable_data
<
int64_t
>
(
ddim
,
pla
ce_
);
}
else
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
FLOAT32
)
{
}
else
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
FLOAT32
)
{
input_ptr
=
input
.
mutable_data
<
float
>
(
ddim
,
pla
tform
::
CPUPlace
()
);
input_ptr
=
input
.
mutable_data
<
float
>
(
ddim
,
pla
ce_
);
}
else
{
}
else
{
LOG
(
ERROR
)
<<
"unsupported feed type "
<<
inputs
[
i
].
dtype
;
LOG
(
ERROR
)
<<
"unsupported feed type "
<<
inputs
[
i
].
dtype
;
return
false
;
return
false
;
}
}
if
(
platform
::
is_cpu_place
(
place_
))
{
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
());
inputs
[
i
].
data
.
length
());
}
else
{
#ifdef PADDLE_WITH_CUDA
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
);
memory
::
Copy
(
dst_gpu_place
,
static_cast
<
void
*>
(
input_ptr
),
platform
::
CPUPlace
(),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
(),
0
);
// stream 0 for sync copy
#else
PADDLE_THROW
(
"Not compile with CUDA, should not reach here."
);
#endif
}
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework
::
LoD
lod
;
framework
::
LoD
lod
;
for
(
auto
&
level
:
inputs
[
i
].
lod
)
{
for
(
auto
&
level
:
inputs
[
i
].
lod
)
{
...
@@ -501,12 +518,10 @@ bool AnalysisPredictor::LoadParameters() {
...
@@ -501,12 +518,10 @@ bool AnalysisPredictor::LoadParameters() {
}
}
AnalysisPredictor
::~
AnalysisPredictor
()
{
AnalysisPredictor
::~
AnalysisPredictor
()
{
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
if
(
FLAGS_profile
)
{
platform
::
DisableProfiler
(
platform
::
EventSortingKey
::
kTotal
,
platform
::
DisableProfiler
(
platform
::
EventSortingKey
::
kTotal
,
"./profile.log"
);
"./profile.log"
);
}
}
#endif
if
(
sub_scope_
)
{
if
(
sub_scope_
)
{
scope_
->
DeleteScope
(
sub_scope_
);
scope_
->
DeleteScope
(
sub_scope_
);
}
}
...
...
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
7e4bd695
...
@@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
...
@@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
framework
::
Scope
*
scope
()
{
return
scope_
.
get
();
}
framework
::
Scope
*
scope
()
{
return
scope_
.
get
();
}
framework
::
ProgramDesc
&
program
()
{
return
*
inference_program_
;
}
framework
::
ProgramDesc
&
program
()
{
return
*
inference_program_
;
}
void
SetMkldnnThreadID
(
int
tid
);
protected:
protected:
bool
PrepareProgram
(
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
);
bool
PrepareProgram
(
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
);
bool
PrepareScope
(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
);
bool
PrepareScope
(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
);
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
7e4bd695
...
@@ -24,11 +24,11 @@ limitations under the License. */
...
@@ -24,11 +24,11 @@ limitations under the License. */
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool
(
profile
,
false
,
"Turn on profiler for fluid"
);
DEFINE_bool
(
profile
,
false
,
"Turn on profiler for fluid"
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
paddle
{
namespace
{
namespace
{
...
@@ -64,7 +64,6 @@ void NativePaddlePredictor::PrepareFeedFetch() {
...
@@ -64,7 +64,6 @@ void NativePaddlePredictor::PrepareFeedFetch() {
bool
NativePaddlePredictor
::
Init
(
bool
NativePaddlePredictor
::
Init
(
std
::
shared_ptr
<
framework
::
Scope
>
parent_scope
)
{
std
::
shared_ptr
<
framework
::
Scope
>
parent_scope
)
{
VLOG
(
3
)
<<
"Predictor::init()"
;
VLOG
(
3
)
<<
"Predictor::init()"
;
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
if
(
FLAGS_profile
)
{
LOG
(
WARNING
)
<<
"Profiler is actived, might affect the performance"
;
LOG
(
WARNING
)
<<
"Profiler is actived, might affect the performance"
;
LOG
(
INFO
)
<<
"You can turn off by set gflags '-profile false'"
;
LOG
(
INFO
)
<<
"You can turn off by set gflags '-profile false'"
;
...
@@ -73,10 +72,9 @@ bool NativePaddlePredictor::Init(
...
@@ -73,10 +72,9 @@ bool NativePaddlePredictor::Init(
:
platform
::
ProfilerState
::
kCPU
;
:
platform
::
ProfilerState
::
kCPU
;
platform
::
EnableProfiler
(
tracking_device
);
platform
::
EnableProfiler
(
tracking_device
);
}
}
#endif
// no matter with or without MKLDNN
// no matter with or without MKLDNN
paddle
::
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
paddle
::
platform
::
SetNumThreads
(
config_
.
cpu_math_library_num_threads
()
);
if
(
config_
.
use_gpu
)
{
if
(
config_
.
use_gpu
)
{
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
...
@@ -121,12 +119,10 @@ bool NativePaddlePredictor::Init(
...
@@ -121,12 +119,10 @@ bool NativePaddlePredictor::Init(
}
}
NativePaddlePredictor
::~
NativePaddlePredictor
()
{
NativePaddlePredictor
::~
NativePaddlePredictor
()
{
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
if
(
FLAGS_profile
)
{
platform
::
DisableProfiler
(
platform
::
EventSortingKey
::
kTotal
,
platform
::
DisableProfiler
(
platform
::
EventSortingKey
::
kTotal
,
"./profile.log"
);
"./profile.log"
);
}
}
#endif
if
(
sub_scope_
)
{
if
(
sub_scope_
)
{
scope_
->
DeleteScope
(
sub_scope_
);
scope_
->
DeleteScope
(
sub_scope_
);
}
}
...
@@ -139,7 +135,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
...
@@ -139,7 +135,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
Timer
timer
;
Timer
timer
;
timer
.
tic
();
timer
.
tic
();
// set feed variable
// set feed variable
std
::
vector
<
framework
::
LoDTensor
>
feeds
;
framework
::
Scope
*
scope
=
sub_scope_
!=
nullptr
?
sub_scope_
:
scope_
.
get
();
framework
::
Scope
*
scope
=
sub_scope_
!=
nullptr
?
sub_scope_
:
scope_
.
get
();
if
(
!
SetFeed
(
inputs
,
scope
))
{
if
(
!
SetFeed
(
inputs
,
scope
))
{
LOG
(
ERROR
)
<<
"fail to set feed"
;
LOG
(
ERROR
)
<<
"fail to set feed"
;
...
@@ -195,17 +190,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
...
@@ -195,17 +190,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework
::
DDim
ddim
=
framework
::
make_ddim
(
inputs
[
i
].
shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
inputs
[
i
].
shape
);
void
*
input_ptr
;
void
*
input_ptr
;
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
INT64
)
{
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
INT64
)
{
input_ptr
=
input
.
mutable_data
<
int64_t
>
(
ddim
,
pla
tform
::
CPUPlace
()
);
input_ptr
=
input
.
mutable_data
<
int64_t
>
(
ddim
,
pla
ce_
);
}
else
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
FLOAT32
)
{
}
else
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
FLOAT32
)
{
input_ptr
=
input
.
mutable_data
<
float
>
(
ddim
,
pla
tform
::
CPUPlace
()
);
input_ptr
=
input
.
mutable_data
<
float
>
(
ddim
,
pla
ce_
);
}
else
{
}
else
{
LOG
(
ERROR
)
<<
"unsupported feed type "
<<
inputs
[
i
].
dtype
;
LOG
(
ERROR
)
<<
"unsupported feed type "
<<
inputs
[
i
].
dtype
;
return
false
;
return
false
;
}
}
if
(
platform
::
is_cpu_place
(
place_
))
{
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
());
inputs
[
i
].
data
.
length
());
}
else
{
#ifdef PADDLE_WITH_CUDA
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
);
memory
::
Copy
(
dst_gpu_place
,
static_cast
<
void
*>
(
input_ptr
),
platform
::
CPUPlace
(),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
(),
0
);
// stream 0 for sync copy
#else
PADDLE_THROW
(
"Not compile with CUDA, should not reach here."
);
#endif
}
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework
::
LoD
lod
;
framework
::
LoD
lod
;
for
(
auto
&
level
:
inputs
[
i
].
lod
)
{
for
(
auto
&
level
:
inputs
[
i
].
lod
)
{
...
...
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
浏览文件 @
7e4bd695
...
@@ -46,8 +46,6 @@ if(WITH_GPU)
...
@@ -46,8 +46,6 @@ if(WITH_GPU)
endif
()
endif
()
endif
(
NOT WIN32
)
endif
(
NOT WIN32
)
endif
()
endif
()
include_directories
(
"D:/Paddle/"
)
include_directories
(
"
${
PADDLE_LIB
}
"
)
include_directories
(
"
${
PADDLE_LIB
}
"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/protobuf/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/protobuf/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/glog/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/glog/include"
)
...
...
paddle/fluid/inference/api/helper.h
浏览文件 @
7e4bd695
...
@@ -15,10 +15,6 @@
...
@@ -15,10 +15,6 @@
#pragma once
#pragma once
#include <glog/logging.h>
#include <glog/logging.h>
#if !defined(_WIN32)
#include <sys/time.h>
#else
#endif
#include <algorithm>
#include <algorithm>
#include <chrono> // NOLINT
#include <chrono> // NOLINT
...
@@ -28,6 +24,7 @@
...
@@ -28,6 +24,7 @@
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
namespace
paddle
{
...
...
paddle/fluid/inference/api/paddle_analysis_config.h
浏览文件 @
7e4bd695
...
@@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
...
@@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
int
max_batch_size
=
1
);
int
max_batch_size
=
1
);
bool
use_tensorrt
()
const
{
return
use_tensorrt_
;
}
bool
use_tensorrt
()
const
{
return
use_tensorrt_
;
}
void
EnableMKLDNN
();
// NOTE this is just for internal development, please not use it.
// NOTE this is just for internal development, please not use it.
// NOT stable yet.
// NOT stable yet.
void
EnableMKLDNN
();
bool
use_mkldnn
()
const
{
return
use_mkldnn_
;
}
bool
use_mkldnn
()
const
{
return
use_mkldnn_
;
}
friend
class
::
paddle
::
AnalysisPredictor
;
friend
class
::
paddle
::
AnalysisPredictor
;
...
...
paddle/fluid/inference/api/paddle_api.h
浏览文件 @
7e4bd695
...
@@ -186,6 +186,19 @@ struct NativeConfig : public PaddlePredictor::Config {
...
@@ -186,6 +186,19 @@ struct NativeConfig : public PaddlePredictor::Config {
// Specify the variable's name of each input if input tensors don't follow the
// Specify the variable's name of each input if input tensors don't follow the
// `feeds` and `fetches` of the phase `save_inference_model`.
// `feeds` and `fetches` of the phase `save_inference_model`.
bool
specify_input_name
{
false
};
bool
specify_input_name
{
false
};
// Accessors for the number of CPU math library threads.
// The setter stores the requested count; the getter returns the stored count.
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
  cpu_math_library_num_threads_ = cpu_math_library_num_threads;
}
int cpu_math_library_num_threads() const {
  return cpu_math_library_num_threads_;
}

protected:
// Thread count of the CPU math library (such as MKL, OpenBlas) for each
// instance. Defaults to 1.
int cpu_math_library_num_threads_{1};
};
};
// A factory to help create different predictors.
// A factory to help create different predictors.
...
...
paddle/fluid/inference/tensorrt/convert/split_op.cc
浏览文件 @
7e4bd695
...
@@ -19,9 +19,6 @@ namespace paddle {
...
@@ -19,9 +19,6 @@ namespace paddle {
namespace
inference
{
namespace
inference
{
namespace
tensorrt
{
namespace
tensorrt
{
/*
* SplitOp.
*/
class
SplitOpConverter
:
public
OpConverter
{
class
SplitOpConverter
:
public
OpConverter
{
public:
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
...
@@ -40,16 +37,11 @@ class SplitOpConverter : public OpConverter {
...
@@ -40,16 +37,11 @@ class SplitOpConverter : public OpConverter {
int
axis
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"axis"
));
int
axis
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"axis"
));
std
::
vector
<
int
>
output_lengths
=
std
::
vector
<
int
>
output_lengths
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"sections"
));
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"sections"
));
// split on batch is not supported in TensorRT
PADDLE_ENFORCE
(
axis
!=
0
);
PADDLE_ENFORCE
(
axis
!=
0
);
if
(
axis
<
0
)
{
axis
+=
(
axis
<
0
)
?
input_dims
.
nbDims
:
-
1
;
axis
+=
input_dims
.
nbDims
;
}
else
{
axis
-=
1
;
}
PADDLE_ENFORCE
(
output_lengths
.
size
()
==
output_num
);
PADDLE_ENFORCE
(
output_lengths
.
size
()
==
output_num
);
//
plugin
::
SplitPlugin
*
plugin
=
new
plugin
::
SplitPlugin
(
axis
,
output_lengths
);
plugin
::
SplitPlugin
*
plugin
=
new
plugin
::
SplitPlugin
(
axis
,
output_lengths
);
nvinfer1
::
IPluginLayer
*
layer
=
nvinfer1
::
IPluginLayer
*
layer
=
engine_
->
AddPlugin
(
&
input
,
input_num
,
plugin
);
engine_
->
AddPlugin
(
&
input
,
input_num
,
plugin
);
...
...
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
浏览文件 @
7e4bd695
...
@@ -20,30 +20,92 @@ namespace paddle {
...
@@ -20,30 +20,92 @@ namespace paddle {
namespace
inference
{
namespace
inference
{
namespace
tensorrt
{
namespace
tensorrt
{
TEST
(
split_op
,
test
)
{
template
<
int
BatchSize
,
int
Axis
>
void
TensorRTSplitTest
(
const
std
::
vector
<
int
>
&
in_shape
,
const
std
::
vector
<
int
>
&
sections
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
""
});
std
::
unordered_set
<
std
::
string
>
parameters
({
""
});
framework
::
Scope
scope
;
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
10
,
parameters
,
scope
,
1000
);
TRTConvertValidation
validator
(
BatchSize
+
1
,
parameters
,
scope
,
10000
);
validator
.
DeclInputVar
(
"split_input"
,
nvinfer1
::
DimsCHW
(
3
,
2
,
2
));
validator
.
DeclOutputVar
(
"split_out1"
,
nvinfer1
::
DimsCHW
(
2
,
2
,
2
));
auto
make_dim
=
[](
const
std
::
vector
<
int
>
&
shape
)
{
validator
.
DeclOutputVar
(
"split_out2"
,
nvinfer1
::
DimsCHW
(
1
,
2
,
2
));
nvinfer1
::
DimsCHW
dim
;
dim
.
c
()
=
shape
[
0
];
dim
.
h
()
=
shape
[
1
];
dim
.
w
()
=
shape
[
2
];
return
dim
;
};
validator
.
DeclInputVar
(
"split_input"
,
make_dim
(
in_shape
));
std
::
vector
<
std
::
string
>
output_vars
;
for
(
size_t
i
=
0
;
i
<
sections
.
size
();
++
i
)
{
auto
out_shape
=
in_shape
;
out_shape
[
Axis
-
1
]
=
sections
[
i
];
std
::
string
output_name
=
"split_out"
+
std
::
to_string
(
i
);
validator
.
DeclOutputVar
(
output_name
,
make_dim
(
out_shape
));
output_vars
.
push_back
(
output_name
);
}
// Prepare Op description
// Prepare Op description
framework
::
OpDesc
desc
;
framework
::
OpDesc
desc
;
desc
.
SetType
(
"split"
);
desc
.
SetType
(
"split"
);
desc
.
SetInput
(
"X"
,
{
"split_input"
});
desc
.
SetInput
(
"X"
,
{
"split_input"
});
desc
.
SetOutput
(
"Out"
,
{
"split_out1"
,
"split_out2"
}
);
desc
.
SetOutput
(
"Out"
,
output_vars
);
int
num
=
0
;
desc
.
SetAttr
(
"axis"
,
Axis
);
int
axis
=
1
;
desc
.
SetAttr
(
"num"
,
0
);
std
::
vector
<
int
>
output_lengths
=
{
2
,
1
};
desc
.
SetAttr
(
"sections"
,
sections
);
desc
.
SetAttr
(
"axis"
,
axis
);
desc
.
SetAttr
(
"num"
,
num
);
desc
.
SetAttr
(
"sections"
,
output_lengths
);
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
1
);
validator
.
Execute
(
BatchSize
);
}
// Each case instantiates TensorRTSplitTest<BatchSize, Axis>(in_shape, sections).
// "Equal sections" cases split the axis into same-sized pieces; "unequal
// sections" cases use pieces of different sizes.

// BatchSize = 1, Axis = 1, equal sections
TEST(split_op, test_same_shape_axis1_batch1) {
  TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2});
}

// BatchSize = 1, Axis = 1, unequal sections
TEST(split_op, test_different_shape_axis1_batch1) {
  TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1});
}

// BatchSize = 10, Axis = 1, equal sections
TEST(split_op, test_same_shape_axis1_batch10) {
  TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2});
}

// BatchSize = 10, Axis = 1, unequal sections
TEST(split_op, test_different_shape_axis1_batch10) {
  TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1});
}

// BatchSize = 1, Axis = 2, equal sections
TEST(split_op, test_same_shape_axis2_batch1) {
  TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2});
}

// BatchSize = 1, Axis = 2, unequal sections
TEST(split_op, test_different_shape_axis2_batch1) {
  TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1});
}

// BatchSize = 10, Axis = 2, equal sections
TEST(split_op, test_same_shape_axis2_batch10) {
  TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2});
}

// BatchSize = 10, Axis = 2, unequal sections
TEST(split_op, test_different_shape_axis2_batch10) {
  TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1});
}

// BatchSize = 1, Axis = 3, equal sections
TEST(split_op, test_same_shape_axis3_batch1) {
  TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2});
}

// BatchSize = 1, Axis = 3, unequal sections
TEST(split_op, test_different_shape_axis3_batch1) {
  TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1});
}

// BatchSize = 10, Axis = 3, equal sections
TEST(split_op, test_same_shape_axis3_batch10) {
  TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2});
}

// BatchSize = 10, Axis = 3, unequal sections
TEST(split_op, test_different_shape_axis3_batch10) {
  TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1});
}
}
}
// namespace tensorrt
}
// namespace tensorrt
...
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
浏览文件 @
7e4bd695
...
@@ -12,6 +12,8 @@
...
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include <cuda_fp16.h>
#include <algorithm>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -19,6 +21,52 @@ namespace inference {
...
@@ -19,6 +21,52 @@ namespace inference {
namespace
tensorrt
{
namespace
tensorrt
{
namespace
plugin
{
namespace
plugin
{
// copied from operators::math::SplitFunctor
//
// Scatters the columns of a row-major [in_row x in_col] input into several
// outputs whose widths may differ.
//
//   input_data    - source matrix, element (r, c) read at r * in_col + c.
//   in_row/in_col - extent of the source matrix.
//   out_cols      - column boundaries: output k receives the source columns
//                   in [out_cols[k], out_cols[k + 1]).
//   out_cols_size - not read by this kernel.
//   outputs_data  - one destination pointer per output; a nullptr entry is
//                   skipped.
//
// Each thread starts at its global x/y index and advances by the total
// number of threads launched in that dimension.
template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row,
                            const int in_col, const int* out_cols,
                            int out_cols_size, T** outputs_data) {
  const int first_row = blockIdx.y * blockDim.y + threadIdx.y;

  // The segment cursor persists across iterations: columns are visited in
  // increasing order, so the search below only ever moves forward.
  int seg = 0;
  int seg_begin = out_cols[0];

  for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < in_col;
       col += blockDim.x * gridDim.x) {
    // Advance to the segment whose column range contains `col`.
    int seg_end = out_cols[seg + 1];
    while (seg_end <= col) {
      seg_begin = seg_end;
      ++seg;
      seg_end = out_cols[seg + 1];
    }

    T* dst = outputs_data[seg];
    if (dst == nullptr) {
      continue;
    }

    const int dst_col = col - seg_begin;
    const int dst_width = seg_end - seg_begin;
    for (int row = first_row; row < in_row; row += blockDim.y * gridDim.y) {
      dst[row * dst_width + dst_col] = input_data[row * in_col + col];
    }
  }
}
// Scatters the columns of a row-major [in_row x in_col] input into outputs
// that all have the same width.
//
//   input_data    - source matrix, element (r, c) read at r * in_col + c.
//   in_row/in_col - extent of the source matrix.
//   fixed_out_col - width shared by every output; source column c goes to
//                   output c / fixed_out_col at column c % fixed_out_col.
//   outputs_data  - one destination pointer per output; a nullptr entry is
//                   skipped.
//
// Each thread starts at its global x/y index and advances by the total
// number of threads launched in that dimension.
template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row,
                            const int in_col, const int fixed_out_col,
                            T** outputs_data) {
  const int first_row = blockIdx.y * blockDim.y + threadIdx.y;

  for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < in_col;
       col += blockDim.x * gridDim.x) {
    const int out_index = col / fixed_out_col;
    const int dst_col = col % fixed_out_col;

    T* dst = outputs_data[out_index];
    if (dst == nullptr) {
      continue;
    }

    for (int row = first_row; row < in_row; row += blockDim.y * gridDim.y) {
      dst[row * fixed_out_col + dst_col] = input_data[row * in_col + col];
    }
  }
}
nvinfer1
::
Dims
SplitPlugin
::
getOutputDimensions
(
nvinfer1
::
Dims
SplitPlugin
::
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
{
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
{
PADDLE_ENFORCE_EQ
(
num_inputs
,
1
);
PADDLE_ENFORCE_EQ
(
num_inputs
,
1
);
...
@@ -31,48 +79,96 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(
...
@@ -31,48 +79,96 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(
int
SplitPlugin
::
initialize
()
{
int
SplitPlugin
::
initialize
()
{
PADDLE_ENFORCE_LE
(
axis_
,
nvinfer1
::
Dims
::
MAX_DIMS
);
PADDLE_ENFORCE_LE
(
axis_
,
nvinfer1
::
Dims
::
MAX_DIMS
);
// notice input dims is [C, H, W]
nvinfer1
::
Dims
dims
=
this
->
getInputDims
(
0
);
outer_rows_
=
1
;
inner_cols_
=
1
;
for
(
int
i
=
0
;
i
<
axis_
;
++
i
)
{
outer_rows_
*=
dims
.
d
[
i
];
}
for
(
int
i
=
axis_
+
1
;
i
<
dims
.
nbDims
;
++
i
)
{
inner_cols_
*=
dims
.
d
[
i
];
}
same_shape_
=
true
;
std
::
vector
<
int
>
segment_offsets
(
1
,
0
);
std
::
vector
<
int
>
segment_offsets
(
1
,
0
);
for
(
int
i
=
0
;
i
<
this
->
getNbOutputs
();
++
i
)
{
for
(
int
i
=
0
;
i
<
this
->
getNbOutputs
();
++
i
)
{
segment_offsets
.
push_back
(
segment_offsets
.
back
()
+
output_length_
[
i
]);
if
(
output_length_
[
i
]
!=
output_length_
[
0
])
{
}
same_shape_
=
false
;
segment_offsets_
=
segment_offsets
;
nvinfer1
::
Dims
dims
=
this
->
getInputDims
(
0
);
nx_
=
1
;
for
(
int
i
=
dims
.
nbDims
-
1
;
i
>
axis_
;
--
i
)
{
nx_
*=
dims
.
d
[
i
];
}
}
ny_
=
dims
.
d
[
axis_
];
segment_offsets
.
push_back
(
segment_offsets
.
back
()
+
nz_
=
1
;
output_length_
[
i
]
*
inner_cols_
);
for
(
int
i
=
axis_
-
1
;
i
>=
0
;
--
i
)
{
nz_
*=
dims
.
d
[
i
];
}
}
inner_cols_
*=
dims
.
d
[
axis_
];
d_segment_offsets_
=
segment_offsets
;
segment_offsets_
=
std
::
move
(
segment_offsets
);
d_output_ptrs_
.
resize
(
this
->
getNbOutputs
(),
nullptr
);
return
0
;
return
0
;
}
}
// Picks a launch configuration and runs the matching SplitKernel overload on
// `stream`.
//
//   same_shape        - true: every output has the same width and the
//                       fixed-width kernel is used, with segment_offsets[1]
//                       as that width. false: the variable-width kernel is
//                       used with d_segment_offsets.
//   outer_rows        - number of rows of the flattened input.
//   inner_cols        - number of columns of the flattened input.
//   segment_offsets   - host-side column boundaries of the outputs.
//   d_segment_offsets - the same boundaries, read by the kernel on the
//                       device.
//   input / outputs   - forwarded unchanged to the kernel.
//                       NOTE(review): the kernel dereferences `outputs`
//                       itself, so it presumably must be a device-accessible
//                       pointer table - confirm at the call site.
//
// The launch is asynchronous; this function does not check for launch errors.
template <typename T>
inline void Split(cudaStream_t stream, const bool same_shape,
                  const int outer_rows, const int inner_cols,
                  const std::vector<int>& segment_offsets,
                  const int* d_segment_offsets, const T* input, T** outputs) {
  // With no columns there is nothing to copy. Returning here also prevents
  // block_cols from becoming 0 below, which would make the block_rows and
  // grid_cols computations divide by zero.
  if (inner_cols <= 0) {
    return;
  }

  const int kThreadsPerBlock = 1024;
  const int kMaxBlocks = 65535;  // cap applied to each grid dimension

  // Use as many threads along x as there are columns, rounded up to a
  // multiple of 32, and give the remaining threads of the block to y.
  int block_cols = kThreadsPerBlock;
  if (inner_cols < kThreadsPerBlock) {
    // block_cols is aligned by 32.
    block_cols = ((inner_cols + 31) >> 5) << 5;
  }
  int block_rows = kThreadsPerBlock / block_cols;
  dim3 block_size = dim3(block_cols, block_rows, 1);

  // Enough blocks along x to cover all columns (capped); along y at least one
  // block and at most what the cap leaves after x. Rows not covered by the
  // grid are picked up by the kernels' strided loops.
  int grid_cols =
      std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks);
  int grid_rows =
      std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1));
  dim3 grid_size = dim3(grid_cols, grid_rows, 1);

  if (same_shape) {
    SplitKernel<<<grid_size, block_size, 0, stream>>>(
        input, outer_rows, inner_cols, segment_offsets[1], outputs);
  } else {
    SplitKernel<<<grid_size, block_size, 0, stream>>>(
        input, outer_rows, inner_cols, d_segment_offsets,
        static_cast<int>(segment_offsets.size()), outputs);
  }
}
int
SplitPlugin
::
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
int
SplitPlugin
::
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
{
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
{
auto
const
&
input_dims
=
this
->
getInputDims
(
0
);
float
const
*
input_ptr
=
reinterpret_cast
<
float
const
*>
(
inputs
[
0
]
);
i
nt
input_size
=
0
;
i
f
(((
batchSize
==
1
&&
axis_
==
0
)
||
axis_
==
-
1
)
&&
float
const
*
idata
=
reinterpret_cast
<
float
const
*>
(
inputs
[
0
]);
this
->
getNbOutputs
()
<
10
)
{
float
**
odata
s
=
reinterpret_cast
<
float
**>
(
outputs
);
float
**
output_ptr
s
=
reinterpret_cast
<
float
**>
(
outputs
);
int
data_type_size
=
(
this
->
getDataType
()
==
nvinfer1
::
DataType
::
kFLOAT
)
// kernel impl here.
?
sizeof
(
float
)
int
inputBatchOffset
=
nx_
*
ny_
*
nz_
;
:
sizeof
(
__half
)
;
for
(
size_t
i
=
0
;
i
<
this
->
getNbOutputs
();
i
++
)
{
for
(
int
i
=
0
;
i
<
this
->
getNbOutputs
();
++
i
)
{
for
(
size_t
j
=
0
;
j
<
batchSize
;
j
++
)
{
PADDLE_ENFORCE
(
cudaMemcpyAsync
(
cudaMemcpyAsync
(
odatas
[
i
]
+
output_ptrs
[
i
],
input_ptr
+
segment_offsets_
[
i
],
j
*
(
segment_offsets_
[
i
+
1
]
-
segment_offsets_
[
i
])
*
nx_
*
(
segment_offsets_
[
i
+
1
]
-
segment_offsets_
[
i
])
*
data_type_size
,
sizeof
(
float
),
cudaMemcpyDeviceToDevice
,
stream
)
==
cudaSuccess
);
inputs
[
0
]
+
}
(
inputBatchOffset
*
j
+
segment_offsets_
[
i
]
*
nx_
)
*
}
else
{
sizeof
(
float
),
outer_rows_
*=
batchSize
;
(
segment_offsets_
[
i
+
1
]
-
segment_offsets_
[
i
])
*
nx_
*
sizeof
(
float
),
const
int
*
d_segment_offsets_ptr
=
cudaMemcpyDeviceToDevice
,
stream
);
thrust
::
raw_pointer_cast
(
&
d_segment_offsets_
[
0
]);
float
**
output_ptrs
=
thrust
::
raw_pointer_cast
(
&
d_output_ptrs_
[
0
]);
PADDLE_ENFORCE
(
cudaMemcpyAsync
(
output_ptrs
,
outputs
,
this
->
getNbOutputs
()
*
sizeof
(
float
*
),
cudaMemcpyHostToDevice
,
stream
)
==
cudaSuccess
);
if
(
this
->
getDataType
()
==
nvinfer1
::
DataType
::
kFLOAT
)
{
Split
(
stream
,
same_shape_
,
outer_rows_
,
inner_cols_
,
segment_offsets_
,
d_segment_offsets_ptr
,
input_ptr
,
output_ptrs
);
}
else
{
Split
(
stream
,
same_shape_
,
outer_rows_
,
inner_cols_
,
segment_offsets_
,
d_segment_offsets_ptr
,
(
__half
*
)
input_ptr
,
// NOLINT
(
__half
**
)
output_ptrs
);
// NOLINT
}
}
}
}
return
cudaGetLastError
()
!=
cudaSuccess
;
return
cudaGetLastError
()
!=
cudaSuccess
;
}
}
...
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
浏览文件 @
7e4bd695
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#pragma once
#pragma once
#include <thrust/device_vector.h>
#include <vector>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
@@ -25,7 +26,7 @@ namespace plugin {
...
@@ -25,7 +26,7 @@ namespace plugin {
class
SplitPlugin
:
public
PluginTensorRT
{
class
SplitPlugin
:
public
PluginTensorRT
{
public:
public:
SplitPlugin
(
int
axis
,
std
::
vector
<
int
>
const
&
output_lengths
)
SplitPlugin
(
int
axis
,
std
::
vector
<
int
>
const
&
output_lengths
)
:
axis_
(
axis
),
output_length_
(
output_lengths
)
{}
:
axis_
(
axis
),
same_shape_
(
true
),
output_length_
(
output_lengths
)
{}
SplitPlugin
(
void
const
*
serial_data
,
size_t
serial_length
)
{
SplitPlugin
(
void
const
*
serial_data
,
size_t
serial_length
)
{
deserializeBase
(
serial_data
,
serial_length
);
deserializeBase
(
serial_data
,
serial_length
);
...
@@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT {
...
@@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT {
}
}
int
axis_
;
int
axis_
;
int
outer_rows_
;
int
inner_cols_
;
bool
same_shape_
;
std
::
vector
<
int
>
output_length_
;
std
::
vector
<
int
>
output_length_
;
int
nx_
,
ny_
,
nz_
;
std
::
vector
<
int
>
segment_offsets_
;
std
::
vector
<
int
>
segment_offsets_
;
thrust
::
device_vector
<
int
>
d_segment_offsets_
;
thrust
::
device_vector
<
float
*>
d_output_ptrs_
;
};
};
}
// namespace plugin
}
// namespace plugin
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
7e4bd695
paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
浏览文件 @
7e4bd695
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <gflags/gflags.h>
#include <gflags/gflags.h>
#include <sys/time.h>
#include <time.h>
#include <time.h>
#include <algorithm>
#include <algorithm>
#include <fstream>
#include <fstream>
...
...
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
浏览文件 @
7e4bd695
...
@@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) {
...
@@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) {
cfg
->
device
=
0
;
cfg
->
device
=
0
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
specify_input_name
=
true
;
cfg
->
specify_input_name
=
true
;
cfg
->
SetCpuMathLibraryNumThreads
(
FLAGS_paddle_num_threads
);
}
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
paddle/fluid/inference/tests/api/config_printer.h
浏览文件 @
7e4bd695
...
@@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
...
@@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os
<<
GenSpaces
(
num_spaces
)
<<
"param_file: "
<<
config
.
param_file
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"param_file: "
<<
config
.
param_file
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
os
<<
GenSpaces
(
num_spaces
)
<<
"specify_input_name: "
<<
config
.
specify_input_name
<<
"
\n
"
;
<<
"specify_input_name: "
<<
config
.
specify_input_name
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"cpu_num_threads: "
<<
config
.
cpu_math_library_num_threads
()
<<
"
\n
"
;
num_spaces
--
;
num_spaces
--
;
os
<<
GenSpaces
(
num_spaces
)
<<
"}
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"}
\n
"
;
return
os
;
return
os
;
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
7e4bd695
...
@@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true,
...
@@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true,
"Running the inference program in analysis mode."
);
"Running the inference program in analysis mode."
);
DECLARE_bool
(
profile
);
DECLARE_bool
(
profile
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
...
@@ -177,11 +178,9 @@ void TestOneThreadPrediction(
...
@@ -177,11 +178,9 @@ void TestOneThreadPrediction(
warmup_timer
.
tic
();
warmup_timer
.
tic
();
predictor
->
Run
(
inputs
[
0
],
outputs
,
batch_size
);
predictor
->
Run
(
inputs
[
0
],
outputs
,
batch_size
);
PrintTime
(
batch_size
,
1
,
1
,
0
,
warmup_timer
.
toc
(),
1
);
PrintTime
(
batch_size
,
1
,
1
,
0
,
warmup_timer
.
toc
(),
1
);
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
if
(
FLAGS_profile
)
{
paddle
::
platform
::
ResetProfiler
();
paddle
::
platform
::
ResetProfiler
();
}
}
#endif
}
}
LOG
(
INFO
)
<<
"Run "
<<
num_times
<<
" times..."
;
LOG
(
INFO
)
<<
"Run "
<<
num_times
<<
" times..."
;
...
@@ -206,22 +205,23 @@ void TestMultiThreadPrediction(
...
@@ -206,22 +205,23 @@ void TestMultiThreadPrediction(
int
batch_size
=
FLAGS_batch_size
;
int
batch_size
=
FLAGS_batch_size
;
int
num_times
=
FLAGS_repeat
;
int
num_times
=
FLAGS_repeat
;
std
::
vector
<
std
::
thread
>
threads
;
std
::
vector
<
std
::
thread
>
threads
;
std
::
vector
<
std
::
unique_ptr
<
PaddlePredictor
>>
predictors
;
auto
main_predictor
=
CreateTestPredictor
(
config
,
use_analysis
);
predictors
.
emplace_back
(
CreateTestPredictor
(
config
,
use_analysis
));
for
(
int
tid
=
1
;
tid
<
num_threads
;
++
tid
)
{
predictors
.
emplace_back
(
predictors
.
front
()
->
Clone
());
}
size_t
total_time
{
0
};
size_t
total_time
{
0
};
for
(
int
tid
=
0
;
tid
<
num_threads
;
++
tid
)
{
for
(
int
tid
=
0
;
tid
<
num_threads
;
++
tid
)
{
threads
.
emplace_back
([
&
,
tid
]()
{
threads
.
emplace_back
([
&
,
tid
]()
{
#ifdef PADDLE_WITH_MKLDNN
platform
::
set_cur_thread_id
(
static_cast
<
int
>
(
tid
)
+
1
);
#endif
// Each thread should have local inputs and outputs.
// Each thread should have local inputs and outputs.
// The inputs of each thread are all the same.
// The inputs of each thread are all the same.
std
::
vector
<
PaddleTensor
>
outputs_tid
;
std
::
vector
<
PaddleTensor
>
outputs_tid
;
auto
&
predictor
=
predictors
[
tid
];
// To ensure the thread binding correctly,
// please clone inside the threadpool.
auto
predictor
=
main_predictor
->
Clone
();
#ifdef PADDLE_WITH_MKLDNN
if
(
use_analysis
)
{
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
())
->
SetMkldnnThreadID
(
static_cast
<
int
>
(
tid
)
+
1
);
}
#endif
// warmup run
// warmup run
LOG
(
INFO
)
<<
"Running thread "
<<
tid
<<
", warm up run..."
;
LOG
(
INFO
)
<<
"Running thread "
<<
tid
<<
", warm up run..."
;
...
@@ -230,11 +230,9 @@ void TestMultiThreadPrediction(
...
@@ -230,11 +230,9 @@ void TestMultiThreadPrediction(
warmup_timer
.
tic
();
warmup_timer
.
tic
();
predictor
->
Run
(
inputs
[
0
],
outputs
,
batch_size
);
predictor
->
Run
(
inputs
[
0
],
outputs
,
batch_size
);
PrintTime
(
batch_size
,
1
,
num_threads
,
tid
,
warmup_timer
.
toc
(),
1
);
PrintTime
(
batch_size
,
1
,
num_threads
,
tid
,
warmup_timer
.
toc
(),
1
);
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
if
(
FLAGS_profile
)
{
paddle
::
platform
::
ResetProfiler
();
paddle
::
platform
::
ResetProfiler
();
}
}
#endif
}
}
LOG
(
INFO
)
<<
"Thread "
<<
tid
<<
" run "
<<
num_times
<<
" times..."
;
LOG
(
INFO
)
<<
"Thread "
<<
tid
<<
" run "
<<
num_times
<<
" times..."
;
...
...
paddle/fluid/inference/tests/book/test_inference_nlp.cc
浏览文件 @
7e4bd695
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <sys/time.h>
#include <time.h>
#include <time.h>
#include <fstream>
#include <fstream>
#include <thread> // NOLINT
#include <thread> // NOLINT
...
...
paddle/fluid/inference/tests/test_helper.h
浏览文件 @
7e4bd695
...
@@ -20,6 +20,7 @@ limitations under the License. */
...
@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
use_mkldnn
);
DECLARE_bool
(
use_mkldnn
);
...
...
paddle/fluid/inference/utils/CMakeLists.txt
0 → 100644
浏览文件 @
7e4bd695
cc_library
(
benchmark SRCS benchmark.cc DEPS enforce
)
cc_test
(
test_benchmark SRCS benchmark_tester.cc DEPS benchmark
)
paddle/fluid/inference/utils/benchmark.cc
0 → 100644
浏览文件 @
7e4bd695
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/benchmark.h"
#include <sstream>
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
inference
{
// Renders this benchmark record as a small tab-separated table: a separator
// line, a header row, and one data row (name, batch size, thread count,
// latency, qps).
//
// qps is derived as 1000 / latency (so latency is presumably in
// milliseconds). It is computed in floating point so that it is not
// truncated to an integer, and is reported as "N/A" when the latency is not
// positive, instead of dividing by zero.
std::string Benchmark::SerializeToString() const {
  std::stringstream ss;

  // Header.
  ss << "-----------------------------------------------------\n";
  ss << "name\t";
  ss << "batch_size\t";
  ss << "num_threads\t";
  ss << "latency\t";
  ss << "qps";
  ss << '\n';

  // Data row.
  ss << name_ << "\t";
  ss << batch_size_ << "\t";
  ss << num_threads_ << "\t";
  ss << latency_ << "\t";
  if (latency_ > 0) {
    ss << 1000.0 / latency_;
  } else {
    ss << "N/A";
  }
  ss << '\n';
  return ss.str();
}
// Appends the serialized benchmark record to the file at `path`.
// The file is opened in append mode, so earlier records are kept.
// PADDLE_ENFORCE fails if the file cannot be opened.
void Benchmark::PersistToFile(const std::string& path) const {
  std::ofstream out(path, std::ios::app);
  PADDLE_ENFORCE(out.is_open(), "Can not open %s to add benchmark", path);

  const std::string record = SerializeToString();
  out << record;
  out.flush();
  out.close();
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/utils/benchmark.h
0 → 100644
浏览文件 @
7e4bd695
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <fstream>
#include <iostream>
#include <string>
namespace
paddle
{
namespace
inference
{
/*
 * Helper class to calculate the performance.
 *
 * Holds one benchmark record (name, batch size, thread count, GPU flag and
 * latency) and can render it as text or append it to a file.
 */
struct Benchmark {
  // Batch size of the measured run; 0 until set.
  int batch_size() const { return batch_size_; }
  void SetBatchSize(int x) { batch_size_ = x; }

  // Number of threads used by the measured run; 1 until set.
  int num_threads() const { return num_threads_; }
  void SetNumThreads(int x) { num_threads_ = x; }

  // Whether the run used a GPU. The flag can only be switched on.
  bool use_gpu() const { return use_gpu_; }
  void SetUseGpu() { use_gpu_ = true; }

  // Measured latency; 0 until set.
  // NOTE(review): SerializeToString() reports qps as 1000 / latency, so the
  // unit is presumably milliseconds - confirm with callers.
  int latency() const { return latency_; }
  void SetLatency(int x) { latency_ = x; }

  // Label identifying this record; empty until set.
  const std::string& name() const { return name_; }
  void SetName(const std::string& name) { name_ = name; }

  // Renders the record as text.
  std::string SerializeToString() const;
  // Appends the rendered record to the file at `path`.
  void PersistToFile(const std::string& path) const;

 private:
  bool use_gpu_{false};
  int batch_size_{0};
  // Zero-initialized so that latency() is well defined before SetLatency().
  int latency_{0};
  int num_threads_{1};
  std::string name_;
};
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/utils/benchmark_tester.cc
0 → 100644
浏览文件 @
7e4bd695
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/benchmark.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
using
namespace
paddle
::
inference
;
// Populates a Benchmark record and logs its serialized form.
TEST(Benchmark, basic) {
  Benchmark benchmark;
  benchmark.SetName("key0");
  benchmark.SetBatchSize(10);
  benchmark.SetUseGpu();
  benchmark.SetLatency(220);

  LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString();
}
// Populates a Benchmark record and appends it to "1.log" three times in a
// row, exercising repeated appends to the same file.
TEST(Benchmark, PersistToFile) {
  Benchmark benchmark;
  benchmark.SetName("key0");
  benchmark.SetBatchSize(10);
  benchmark.SetUseGpu();
  benchmark.SetLatency(220);

  for (int round = 0; round < 3; ++round) {
    benchmark.PersistToFile("1.log");
  }
}
\ No newline at end of file
paddle/fluid/memory/allocation/retry_allocator_test.cc
浏览文件 @
7e4bd695
...
@@ -41,7 +41,7 @@ TEST(RetryAllocator, RetryAllocator) {
...
@@ -41,7 +41,7 @@ TEST(RetryAllocator, RetryAllocator) {
size_t
thread_num
=
32
;
size_t
thread_num
=
32
;
size_t
sleep_time
=
40
;
size_t
sleep_time
=
40
;
size_t
extra_time
=
2
;
size_t
extra_time
=
10
;
// Reserve to perform more tests in the future
// Reserve to perform more tests in the future
std
::
vector
<
std
::
shared_ptr
<
Allocator
>>
allocators
;
std
::
vector
<
std
::
shared_ptr
<
Allocator
>>
allocators
;
...
...
paddle/fluid/operators/beam_search_op_test.cc
浏览文件 @
7e4bd695
...
@@ -46,7 +46,7 @@ void CreateInput(LoDTensor* ids, LoDTensor* scores) {
...
@@ -46,7 +46,7 @@ void CreateInput(LoDTensor* ids, LoDTensor* scores) {
auto
*
scores_data
=
scores
->
mutable_data
<
float
>
(
place
);
auto
*
scores_data
=
scores
->
mutable_data
<
float
>
(
place
);
vector
<
int64_t
>
_ids
({
4
,
2
,
5
,
2
,
1
,
3
,
3
,
5
,
2
,
8
,
2
,
1
});
vector
<
int64_t
>
_ids
({
4
,
2
,
5
,
2
,
1
,
3
,
3
,
5
,
2
,
8
,
2
,
1
});
vector
<
float
>
_scores
(
vector
<
float
>
_scores
(
{
0.5
,
0.3
,
0.2
,
0.6
,
0.3
,
0.1
,
0.9
,
0.5
,
0.1
,
0.7
,
0.5
,
0.1
});
{
0.5
f
,
0.3
f
,
0.2
f
,
0.6
f
,
0.3
f
,
0.1
f
,
0.9
f
,
0.5
f
,
0.1
f
,
0.7
f
,
0.5
f
,
0.1
f
});
for
(
int
i
=
0
;
i
<
12
;
i
++
)
{
for
(
int
i
=
0
;
i
<
12
;
i
++
)
{
ids_data
[
i
]
=
_ids
[
i
];
ids_data
[
i
]
=
_ids
[
i
];
...
@@ -80,7 +80,7 @@ TEST(DISABLED_beam_search_op, run) {
...
@@ -80,7 +80,7 @@ TEST(DISABLED_beam_search_op, run) {
ASSERT_EQ
(
sids
.
lod
(),
sscores
.
lod
());
ASSERT_EQ
(
sids
.
lod
(),
sscores
.
lod
());
vector
<
int
>
tids
({
4
,
2
,
3
,
8
});
vector
<
int
>
tids
({
4
,
2
,
3
,
8
});
vector
<
float
>
tscores
({
0.5
,
0.6
,
0.9
,
0.7
});
vector
<
float
>
tscores
({
0.5
f
,
0.6
f
,
0.9
f
,
0.7
f
});
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
ASSERT_EQ
(
tids
[
i
],
sids
.
data
<
int64_t
>
()[
i
]);
ASSERT_EQ
(
tids
[
i
],
sids
.
data
<
int64_t
>
()[
i
]);
...
...
paddle/fluid/operators/detection/CMakeLists.txt
浏览文件 @
7e4bd695
...
@@ -22,7 +22,7 @@ iou_similarity_op.cu)
...
@@ -22,7 +22,7 @@ iou_similarity_op.cu)
detection_library
(
mine_hard_examples_op SRCS mine_hard_examples_op.cc
)
detection_library
(
mine_hard_examples_op SRCS mine_hard_examples_op.cc
)
detection_library
(
multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc
)
detection_library
(
multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc
)
detection_library
(
prior_box_op SRCS prior_box_op.cc prior_box_op.cu
)
detection_library
(
prior_box_op SRCS prior_box_op.cc prior_box_op.cu
)
detection_library
(
density_prior_box_op SRCS density_prior_box_op.cc
)
detection_library
(
density_prior_box_op SRCS density_prior_box_op.cc
density_prior_box_op.cu
)
detection_library
(
anchor_generator_op SRCS anchor_generator_op.cc
detection_library
(
anchor_generator_op SRCS anchor_generator_op.cc
anchor_generator_op.cu
)
anchor_generator_op.cu
)
detection_library
(
target_assign_op SRCS target_assign_op.cc
detection_library
(
target_assign_op SRCS target_assign_op.cc
...
...
paddle/fluid/operators/detection/density_prior_box_op.cc
浏览文件 @
7e4bd695
...
@@ -39,17 +39,15 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
...
@@ -39,17 +39,15 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
auto
fixed_sizes
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
float
>>
(
"fixed_sizes"
);
auto
fixed_sizes
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
float
>>
(
"fixed_sizes"
);
auto
fixed_ratios
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
float
>>
(
"fixed_ratios"
);
auto
fixed_ratios
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
float
>>
(
"fixed_ratios"
);
auto
densities
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"densities"
);
auto
densities
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"densities"
);
bool
flatten
=
ctx
->
Attrs
().
Get
<
bool
>
(
"flatten_to_2d"
);
PADDLE_ENFORCE_EQ
(
fixed_sizes
.
size
(),
densities
.
size
(),
PADDLE_ENFORCE_EQ
(
fixed_sizes
.
size
(),
densities
.
size
(),
"The number of fixed_sizes and densities must be equal."
);
"The number of fixed_sizes and densities must be equal."
);
size_t
num_priors
=
0
;
size_t
num_priors
=
0
;
if
((
fixed_sizes
.
size
()
>
0
)
&&
(
densities
.
size
()
>
0
))
{
for
(
size_t
i
=
0
;
i
<
densities
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
densities
.
size
();
++
i
)
{
if
(
fixed_ratios
.
size
()
>
0
)
{
num_priors
+=
(
fixed_ratios
.
size
())
*
(
pow
(
densities
[
i
],
2
));
num_priors
+=
(
fixed_ratios
.
size
())
*
(
pow
(
densities
[
i
],
2
));
}
}
}
if
(
!
flatten
)
{
}
std
::
vector
<
int64_t
>
dim_vec
(
4
);
std
::
vector
<
int64_t
>
dim_vec
(
4
);
dim_vec
[
0
]
=
input_dims
[
2
];
dim_vec
[
0
]
=
input_dims
[
2
];
dim_vec
[
1
]
=
input_dims
[
3
];
dim_vec
[
1
]
=
input_dims
[
3
];
...
@@ -57,6 +55,11 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
...
@@ -57,6 +55,11 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
dim_vec
[
3
]
=
4
;
dim_vec
[
3
]
=
4
;
ctx
->
SetOutputDim
(
"Boxes"
,
framework
::
make_ddim
(
dim_vec
));
ctx
->
SetOutputDim
(
"Boxes"
,
framework
::
make_ddim
(
dim_vec
));
ctx
->
SetOutputDim
(
"Variances"
,
framework
::
make_ddim
(
dim_vec
));
ctx
->
SetOutputDim
(
"Variances"
,
framework
::
make_ddim
(
dim_vec
));
}
else
{
int64_t
dim0
=
input_dims
[
2
]
*
input_dims
[
3
]
*
num_priors
;
ctx
->
SetOutputDim
(
"Boxes"
,
{
dim0
,
4
});
ctx
->
SetOutputDim
(
"Variances"
,
{
dim0
,
4
});
}
}
}
protected:
protected:
...
@@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
...
@@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
Tensor
>
(
"Input"
)
->
type
()),
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
Tensor
>
(
"Input"
)
->
type
()),
platform
::
CPU
Place
());
ctx
.
Get
Place
());
}
}
};
};
...
@@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
});
});
AddAttr
<
bool
>
(
"clip"
,
"(bool) Whether to clip out-of-boundary boxes."
)
AddAttr
<
bool
>
(
"clip"
,
"(bool) Whether to clip out-of-boundary boxes."
)
.
SetDefault
(
true
);
.
SetDefault
(
true
);
AddAttr
<
bool
>
(
"flatten_to_2d"
,
"(bool) Whether to flatten to 2D and "
"the second dim is 4."
)
.
SetDefault
(
false
);
AddAttr
<
float
>
(
AddAttr
<
float
>
(
"step_w"
,
"step_w"
,
"Density prior boxes step across width, 0.0 for auto calculation."
)
"Density prior boxes step across width, 0.0 for auto calculation."
)
...
...
paddle/fluid/operators/detection/density_prior_box_op.cu
0 → 100644
浏览文件 @
7e4bd695
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/density_prior_box_op.h"
namespace
paddle
{
namespace
operators
{
// Clamps `in` to the closed interval [0, 1] in device code.
//
// The bounds are built as values of type T rather than the double literals
// `0.` / `1.`, so a float instantiation is not promoted to double-precision
// min/max before being converted back to T.
template <typename T>
static __device__ inline T Clip(T in) {
  const T lower = static_cast<T>(0);
  const T upper = static_cast<T>(1);
  return min(max(in, lower), upper);
}
// Device kernel that writes one density prior box, together with its
// variances, for every (feature row, feature column, prior) triple.
//
// `ratios_shift` is read as four back-to-back runs of `num_priors` values:
// box widths, box heights, center x shifts and center y shifts.
// Threads cover rows with their global y index and the flattened
// (column, prior) axis with their global x index, advancing by the total
// number of launched threads along each axis.
// Each box takes four consecutive elements of `out` (xmin, ymin, xmax, ymax)
// and the same four slots of `var` receive the four variance arguments.
template <typename T>
static __global__ void GenDensityPriorBox(
    const int height, const int width, const int im_height, const int im_width,
    const T offset, const T step_width, const T step_height,
    const int num_priors, const T* ratios_shift, bool is_clip,
    const T var_xmin, const T var_ymin, const T var_xmax, const T var_ymax,
    T* out, T* var) {
  const int first_col = blockIdx.x * blockDim.x + threadIdx.x;
  const int first_row = blockIdx.y * blockDim.y + threadIdx.y;
  const int col_stride = blockDim.x * gridDim.x;
  const int row_stride = blockDim.y * gridDim.y;

  // Views into the packed per-prior table.
  const T* prior_widths = ratios_shift;
  const T* prior_heights = ratios_shift + num_priors;
  const T* shifts_x = ratios_shift + 2 * num_priors;
  const T* shifts_y = ratios_shift + 3 * num_priors;

  for (int row = first_row; row < height; row += row_stride) {
    for (int col = first_col; col < width * num_priors; col += col_stride) {
      // `col` enumerates (feature column, prior index) pairs.
      const int cell_x = col / num_priors;
      const int prior = col % num_priors;

      T cell_center_x = (cell_x + offset) * step_width;
      T cell_center_y = (row + offset) * step_height;

      T box_center_x = cell_center_x + shifts_x[prior];
      T box_center_y = cell_center_y + shifts_y[prior];

      T half_w = prior_widths[prior] / 2.;
      T half_h = prior_heights[prior] / 2.;

      // Normalize by the image size; the lower corner is floored at 0 and the
      // upper corner is capped at 1 independently of `is_clip`.
      T xmin = max((box_center_x - half_w) / im_width, 0.);
      T ymin = max((box_center_y - half_h) / im_height, 0.);
      T xmax = min((box_center_x + half_w) / im_width, 1.);
      T ymax = min((box_center_y + half_h) / im_height, 1.);

      const int base = (row * width * num_priors + col) * 4;
      if (is_clip) {
        out[base] = Clip<T>(xmin);
        out[base + 1] = Clip<T>(ymin);
        out[base + 2] = Clip<T>(xmax);
        out[base + 3] = Clip<T>(ymax);
      } else {
        out[base] = xmin;
        out[base + 1] = ymin;
        out[base + 2] = xmax;
        out[base + 3] = ymax;
      }

      var[base] = var_xmin;
      var[base + 1] = var_ymin;
      var[base + 2] = var_xmax;
      var[base + 3] = var_ymax;
    }
  }
}
// CUDA implementation of the density_prior_box operator.
//
// Compute() reads the "Input" feature map and the "Image" tensor, derives the
// step between prior centers (from the step_w/step_h attributes, or from the
// image/feature-map size ratio when either attribute is 0), builds a host-side
// table with the width, height and center shift of every prior, copies that
// table to the kernel's place and launches GenDensityPriorBox to fill the
// "Boxes" and "Variances" outputs.
template <typename T>
class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");

    auto variances = ctx.Attr<std::vector<float>>("variances");
    auto is_clip = ctx.Attr<bool>("clip");

    auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
    auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
    auto densities = ctx.Attr<std::vector<int>>("densities");

    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
    T offset = static_cast<T>(ctx.Attr<float>("offset"));

    // Dimensions 2 and 3 of both tensors are used as height and width.
    auto img_width = image->dims()[3];
    auto img_height = image->dims()[2];

    auto feature_width = input->dims()[3];
    auto feature_height = input->dims()[2];

    // A zero step attribute selects automatic steps: image size divided by
    // feature-map size.
    T step_width, step_height;
    if (step_w == 0 || step_h == 0) {
      step_width = static_cast<T>(img_width) / feature_width;
      step_height = static_cast<T>(img_height) / feature_height;
    } else {
      step_width = step_w;
      step_height = step_h;
    }

    // Every density d contributes d * d boxes per fixed ratio. Integer
    // multiplication replaces the previous pow() call, which went through
    // double and was implicitly narrowed back to int.
    const int num_ratios = static_cast<int>(fixed_ratios.size());
    int num_priors = 0;
    for (size_t i = 0; i < densities.size(); ++i) {
      num_priors += num_ratios * densities[i] * densities[i];
    }
    int step_average = static_cast<int>((step_width + step_height) * 0.5);

    // Host-side table laid out as four runs of num_priors values:
    // [box widths | box heights | center x shifts | center y shifts].
    framework::Tensor h_temp;
    T* tdata = h_temp.mutable_data<T>({num_priors * 4}, platform::CPUPlace());
    int idx = 0;
    for (size_t s = 0; s < fixed_sizes.size(); ++s) {
      auto fixed_size = fixed_sizes[s];
      int density = densities[s];
      for (size_t r = 0; r < fixed_ratios.size(); ++r) {
        float ar = fixed_ratios[r];
        int shift = step_average / density;
        float box_width_ratio = fixed_size * sqrt(ar);
        float box_height_ratio = fixed_size / sqrt(ar);
        for (int di = 0; di < density; ++di) {
          for (int dj = 0; dj < density; ++dj) {
            float center_x_temp = shift / 2. + dj * shift - step_average / 2.;
            float center_y_temp = shift / 2. + di * shift - step_average / 2.;
            tdata[idx] = box_width_ratio;
            tdata[num_priors + idx] = box_height_ratio;
            tdata[2 * num_priors + idx] = center_x_temp;
            tdata[3 * num_priors + idx] = center_y_temp;
            idx++;
          }
        }
      }
    }

    boxes->mutable_data<T>(ctx.GetPlace());
    vars->mutable_data<T>(ctx.GetPlace());

    // Copy the table to the place the kernel runs on.
    framework::Tensor d_temp;
    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);

    // At least use 32 threads, at most 512 threads.
    // blockx is multiple of 32.
    int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L);
    int gridx = (feature_width * num_priors + blockx - 1) / blockx;
    dim3 threads(blockx, 1);
    dim3 grids(gridx, feature_height);

    auto stream =
        ctx.template device_context<platform::CUDADeviceContext>().stream();
    GenDensityPriorBox<T><<<grids, threads, 0, stream>>>(
        feature_height, feature_width, img_height, img_width, offset,
        step_width, step_height, num_priors, d_temp.data<T>(), is_clip,
        variances[0], variances[1], variances[2], variances[3],
        boxes->data<T>(), vars->data<T>());
  }
};
// namespace operators
}
// namespace operators
}
// namespace paddle
// Register DensityPriorBoxOpCUDAKernel as the CUDA kernel of the
// density_prior_box operator, instantiated for float and double.
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
density_prior_box
,
ops
::
DensityPriorBoxOpCUDAKernel
<
float
>
,
ops
::
DensityPriorBoxOpCUDAKernel
<
double
>
);
paddle/fluid/operators/detection/density_prior_box_op.h
浏览文件 @
7e4bd695
/* Copyright (c) 201
6
PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 201
8
PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
You may obtain a copy of the License at
...
@@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
step_height
=
step_h
;
step_height
=
step_h
;
}
}
int
num_priors
=
0
;
int
num_priors
=
0
;
if
(
fixed_sizes
.
size
()
>
0
&&
densities
.
size
()
>
0
)
{
for
(
size_t
i
=
0
;
i
<
densities
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
densities
.
size
();
++
i
)
{
if
(
fixed_ratios
.
size
()
>
0
)
{
num_priors
+=
(
fixed_ratios
.
size
())
*
(
pow
(
densities
[
i
],
2
));
num_priors
+=
(
fixed_ratios
.
size
())
*
(
pow
(
densities
[
i
],
2
));
}
}
}
}
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
vars
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
vars
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
e_boxes
=
framework
::
EigenTensor
<
T
,
4
>::
From
(
*
boxes
).
setConstant
(
0.0
);
auto
box_dim
=
vars
->
dims
();
boxes
->
Resize
({
feature_height
,
feature_width
,
num_priors
,
4
});
auto
e_boxes
=
framework
::
EigenTensor
<
T
,
4
>::
From
(
*
boxes
).
setConstant
(
0.0
);
int
step_average
=
static_cast
<
int
>
((
step_width
+
step_height
)
*
0.5
);
int
step_average
=
static_cast
<
int
>
((
step_width
+
step_height
)
*
0.5
);
for
(
int
h
=
0
;
h
<
feature_height
;
++
h
)
{
for
(
int
h
=
0
;
h
<
feature_height
;
++
h
)
{
...
@@ -76,7 +74,6 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -76,7 +74,6 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
auto
fixed_size
=
fixed_sizes
[
s
];
auto
fixed_size
=
fixed_sizes
[
s
];
int
density
=
densities
[
s
];
int
density
=
densities
[
s
];
// Generate density prior boxes with fixed ratios.
// Generate density prior boxes with fixed ratios.
if
(
fixed_ratios
.
size
()
>
0
)
{
for
(
size_t
r
=
0
;
r
<
fixed_ratios
.
size
();
++
r
)
{
for
(
size_t
r
=
0
;
r
<
fixed_ratios
.
size
();
++
r
)
{
float
ar
=
fixed_ratios
[
r
];
float
ar
=
fixed_ratios
[
r
];
int
shift
=
step_average
/
density
;
int
shift
=
step_average
/
density
;
...
@@ -111,7 +108,6 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -111,7 +108,6 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
}
}
}
}
}
}
}
if
(
clip
)
{
if
(
clip
)
{
platform
::
Transform
<
platform
::
CPUDeviceContext
>
trans
;
platform
::
Transform
<
platform
::
CPUDeviceContext
>
trans
;
ClipFunctor
<
T
>
clip_func
;
ClipFunctor
<
T
>
clip_func
;
...
@@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
e_vars
=
var_et
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
box_num
,
1
));
e_vars
=
var_et
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
box_num
,
1
));
vars
->
Resize
(
var_dim
);
vars
->
Resize
(
var_dim
);
boxes
->
Resize
(
box_dim
);
}
}
};
// namespace operators
};
// namespace operators
...
...
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
7e4bd695
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <sys/time.h>
#include <limits>
#include <limits>
#include "glog/logging.h" // For VLOG
#include "glog/logging.h" // For VLOG
...
@@ -20,8 +19,11 @@ limitations under the License. */
...
@@ -20,8 +19,11 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
rpc_disable_reuse_port
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
distributed
{
namespace
distributed
{
...
@@ -383,6 +385,9 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
...
@@ -383,6 +385,9 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
// Channel configurations:
// Channel configurations:
grpc
::
ChannelArguments
args
;
grpc
::
ChannelArguments
args
;
args
.
SetInt
(
GRPC_ARG_MAX_RECONNECT_BACKOFF_MS
,
2000
);
args
.
SetInt
(
GRPC_ARG_MAX_RECONNECT_BACKOFF_MS
,
2000
);
if
(
FLAGS_rpc_disable_reuse_port
)
{
args
.
SetInt
(
GRPC_ARG_ALLOW_REUSEPORT
,
0
);
}
args
.
SetCompressionAlgorithm
(
GRPC_COMPRESS_NONE
);
args
.
SetCompressionAlgorithm
(
GRPC_COMPRESS_NONE
);
args
.
SetMaxSendMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
args
.
SetMaxSendMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
args
.
SetMaxReceiveMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
args
.
SetMaxReceiveMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
...
...
paddle/fluid/operators/distributed/grpc_serde.cc
浏览文件 @
7e4bd695
...
@@ -15,7 +15,6 @@ limitations under the License. */
...
@@ -15,7 +15,6 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
#include <nccl.h>
#include <nccl.h>
#endif
#endif
#include <sys/time.h>
#include <thread> // NOLINT
#include <thread> // NOLINT
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/coded_stream.h"
...
@@ -26,6 +25,7 @@ limitations under the License. */
...
@@ -26,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
paddle
{
...
...
paddle/fluid/operators/distributed/grpc_serde.h
浏览文件 @
7e4bd695
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
#include <sys/time.h>
#include <iostream>
#include <iostream>
#include <string>
#include <string>
#include <vector>
#include <vector>
...
@@ -25,6 +25,7 @@ limitations under the License. */
...
@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
...
...
paddle/fluid/operators/distributed/grpc_server.cc
浏览文件 @
7e4bd695
...
@@ -20,6 +20,8 @@ limitations under the License. */
...
@@ -20,6 +20,8 @@ limitations under the License. */
using
::
grpc
::
ServerAsyncResponseWriter
;
using
::
grpc
::
ServerAsyncResponseWriter
;
DECLARE_bool
(
rpc_disable_reuse_port
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
distributed
{
namespace
distributed
{
...
@@ -252,6 +254,20 @@ void AsyncGRPCServer::WaitServerReady() {
...
@@ -252,6 +254,20 @@ void AsyncGRPCServer::WaitServerReady() {
VLOG
(
40
)
<<
"AsyncGRPCServer WaitSeverReady"
;
VLOG
(
40
)
<<
"AsyncGRPCServer WaitSeverReady"
;
}
}
// Server builder option that sets GRPC_ARG_ALLOW_REUSEPORT to 0 on the
// channel arguments, i.e. disables SO_REUSEPORT for the server socket.
// Adapted from:
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
class NoReusePortOption : public ::grpc::ServerBuilderOption {
 public:
  // Turns port reuse off in the arguments handed over by the builder.
  void UpdateArguments(::grpc::ChannelArguments* args) override {
    args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
  }

  // Plugins are left exactly as they are.
  void UpdatePlugins(
      std::vector<std::unique_ptr<::grpc::ServerBuilderPlugin>>* plugins)
      override {}
};
void
AsyncGRPCServer
::
StartServer
()
{
void
AsyncGRPCServer
::
StartServer
()
{
::
grpc
::
ServerBuilder
builder
;
::
grpc
::
ServerBuilder
builder
;
builder
.
AddListeningPort
(
bind_address_
,
::
grpc
::
InsecureServerCredentials
(),
builder
.
AddListeningPort
(
bind_address_
,
::
grpc
::
InsecureServerCredentials
(),
...
@@ -259,6 +275,10 @@ void AsyncGRPCServer::StartServer() {
...
@@ -259,6 +275,10 @@ void AsyncGRPCServer::StartServer() {
builder
.
SetMaxSendMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
builder
.
SetMaxSendMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
builder
.
SetMaxReceiveMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
builder
.
SetMaxReceiveMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
if
(
FLAGS_rpc_disable_reuse_port
)
{
builder
.
SetOption
(
std
::
unique_ptr
<::
grpc
::
ServerBuilderOption
>
(
new
NoReusePortOption
));
}
builder
.
RegisterService
(
&
service_
);
builder
.
RegisterService
(
&
service_
);
for
(
auto
t
:
rpc_call_map_
)
{
for
(
auto
t
:
rpc_call_map_
)
{
...
...
paddle/fluid/operators/distributed/sendrecvop_utils.cc
浏览文件 @
7e4bd695
...
@@ -15,12 +15,14 @@ limitations under the License. */
...
@@ -15,12 +15,14 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
#include <nccl.h>
#include <nccl.h>
#endif
#endif
#include <sys/time.h>
#include <thread> // NOLINT
#include <thread> // NOLINT
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/port.h"
DEFINE_bool
(
rpc_disable_reuse_port
,
false
,
"Disable SO_REUSEPORT or not."
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
...
paddle/fluid/operators/distributed/sendrecvop_utils.h
浏览文件 @
7e4bd695
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
#include <sys/time.h>
#include <iostream>
#include <iostream>
#include <string>
#include <string>
#include <vector>
#include <vector>
...
@@ -24,6 +23,7 @@ limitations under the License. */
...
@@ -24,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
...
...
paddle/fluid/operators/fused/CMakeLists.txt
浏览文件 @
7e4bd695
include
(
operators
)
include
(
operators
)
register_operators
()
register_operators
(
EXCLUDES fusion_transpose_flatten_concat_op
)
if
(
WITH_GPU
)
op_library
(
fusion_transpose_flatten_concat_op
)
file
(
APPEND
${
pybind_file
}
"USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);
\n
"
)
endif
()
paddle/fluid/operators/fused/fusion_gru_op.cc
浏览文件 @
7e4bd695
...
@@ -192,11 +192,14 @@ class FusionGRUKernel : public framework::OpKernel<T> {
...
@@ -192,11 +192,14 @@ class FusionGRUKernel : public framework::OpKernel<T> {
const int M = x_dims[1]; \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const int D2 = D * 2; \
const auto& ker = math::jitkernel::KernelPool::Instance() \
const math::jitkernel::gru_attr_t attr( \
D, ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("activation")); \
math::jitkernel::gru_t one_step; \
const auto& ker = \
math::jitkernel::KernelPool::Instance() \
.template Get<math::jitkernel::GRUKernel<T>, \
.template Get<math::jitkernel::GRUKernel<T>, \
const std::string&, const std::string&>( \
const math::jitkernel::gru_attr_t&>(attr); \
ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("activation"), D); \
const T* x_data = x->data<T>(); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
const T* wh_data = wh->data<T>(); \
...
@@ -237,7 +240,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
...
@@ -237,7 +240,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
if
(
h0_data
)
{
if
(
h0_data
)
{
prev_hidden_data
=
h0_data
+
bid
*
D
;
prev_hidden_data
=
h0_data
+
bid
*
D
;
}
else
{
}
else
{
ker
->
ComputeH1
(
xx_data
,
hidden_out_data
);
one_step
.
gates
=
xx_data
;
one_step
.
ht
=
hidden_out_data
;
ker
->
ComputeH1
(
&
one_step
,
&
attr
);
prev_hidden_data
=
hidden_out_data
;
prev_hidden_data
=
hidden_out_data
;
tstart
=
1
;
tstart
=
1
;
move_step
();
move_step
();
...
@@ -247,12 +252,15 @@ class FusionGRUKernel : public framework::OpKernel<T> {
...
@@ -247,12 +252,15 @@ class FusionGRUKernel : public framework::OpKernel<T> {
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D2
,
D
,
static_cast
<
T
>
(
1
),
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D2
,
D
,
static_cast
<
T
>
(
1
),
prev_hidden_data
,
D
,
wh_data
,
D2
,
static_cast
<
T
>
(
1
),
xx_data
,
prev_hidden_data
,
D
,
wh_data
,
D2
,
static_cast
<
T
>
(
1
),
xx_data
,
D3
);
D3
);
ker
->
ComputeHtPart1
(
xx_data
,
prev_hidden_data
,
hidden_out_data
);
one_step
.
gates
=
xx_data
;
one_step
.
ht_1
=
prev_hidden_data
;
one_step
.
ht
=
hidden_out_data
;
ker
->
ComputeHtPart1
(
&
one_step
,
&
attr
);
// gemm rt * Ws
// gemm rt * Ws
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D
,
D
,
static_cast
<
T
>
(
1
),
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D
,
D
,
static_cast
<
T
>
(
1
),
hidden_out_data
,
D
,
wh_state_data
,
D
,
static_cast
<
T
>
(
1
),
hidden_out_data
,
D
,
wh_state_data
,
D
,
static_cast
<
T
>
(
1
),
xx_data
+
D2
,
D3
);
xx_data
+
D2
,
D3
);
ker
->
ComputeHtPart2
(
xx_data
,
prev_hidden_data
,
hidden_out_data
);
ker
->
ComputeHtPart2
(
&
one_step
,
&
attr
);
// save prev
// save prev
prev_hidden_data
=
hidden_out_data
;
prev_hidden_data
=
hidden_out_data
;
move_step
();
move_step
();
...
@@ -314,7 +322,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
...
@@ -314,7 +322,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
T
*
cur_out_data
=
batched_out_data
;
T
*
cur_out_data
=
batched_out_data
;
// W: {W_update, W_reset; W_state}
// W: {W_update, W_reset; W_state}
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
ker
->
ComputeH1
(
cur_in_data
,
cur_out_data
);
one_step
.
gates
=
cur_in_data
;
one_step
.
ht
=
cur_out_data
;
ker
->
ComputeH1
(
&
one_step
,
&
attr
);
// add offset
// add offset
cur_in_data
+=
D3
;
cur_in_data
+=
D3
;
cur_out_data
+=
D
;
cur_out_data
+=
D
;
...
@@ -339,8 +349,11 @@ class FusionGRUKernel : public framework::OpKernel<T> {
...
@@ -339,8 +349,11 @@ class FusionGRUKernel : public framework::OpKernel<T> {
T
*
cur_out_data
=
batched_out_data
;
T
*
cur_out_data
=
batched_out_data
;
T
*
cur_prev_hidden_data
=
prev_hidden_data
;
T
*
cur_prev_hidden_data
=
prev_hidden_data
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
ker
->
ComputeHtPart1
(
cur_batched_data
,
cur_prev_hidden_data
,
one_step
.
gates
=
cur_batched_data
;
cur_out_data
);
one_step
.
ht_1
=
cur_prev_hidden_data
;
one_step
.
ht
=
cur_out_data
;
ker
->
ComputeHtPart1
(
&
one_step
,
&
attr
);
cur_batched_data
+=
D3
;
cur_batched_data
+=
D3
;
cur_prev_hidden_data
+=
D
;
cur_prev_hidden_data
+=
D
;
cur_out_data
+=
D
;
cur_out_data
+=
D
;
...
@@ -354,8 +367,10 @@ class FusionGRUKernel : public framework::OpKernel<T> {
...
@@ -354,8 +367,10 @@ class FusionGRUKernel : public framework::OpKernel<T> {
cur_prev_hidden_data
=
prev_hidden_data
;
cur_prev_hidden_data
=
prev_hidden_data
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
ker
->
ComputeHtPart2
(
cur_batched_data
,
cur_prev_hidden_data
,
one_step
.
gates
=
cur_batched_data
;
cur_out_data
);
one_step
.
ht_1
=
cur_prev_hidden_data
;
one_step
.
ht
=
cur_out_data
;
ker
->
ComputeHtPart2
(
&
one_step
,
&
attr
);
cur_batched_data
+=
D3
;
cur_batched_data
+=
D3
;
cur_prev_hidden_data
+=
D
;
cur_prev_hidden_data
+=
D
;
cur_out_data
+=
D
;
cur_out_data
+=
D
;
...
...
paddle/fluid/operators/fused/fusion_lstm_op.cc
浏览文件 @
7e4bd695
...
@@ -250,13 +250,17 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -250,13 +250,17 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
} \
} \
const math::jitkernel::lstm_attr_t attr( \
D, ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("candidate_activation"), \
ctx.Attr<std::string>("cell_activation"), use_peepholes); \
math::jitkernel::lstm_t one_step; \
one_step.wp = wp_data; \
one_step.checked = checked_cell_data; \
const auto& ker = \
const auto& ker = \
math::jitkernel::KernelPool::Instance() \
math::jitkernel::KernelPool::Instance() \
.template Get<math::jitkernel::LSTMKernel<T>, const std::string&, \
.template Get<math::jitkernel::LSTMKernel<T>, \
const std::string&, const std::string&>( \
const math::jitkernel::lstm_attr_t&>(attr)
ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("candidate_activation"), \
ctx.Attr<std::string>("cell_activation"), D, use_peepholes)
// Wh GEMM
// Wh GEMM
#define GEMM_WH_ADDON(bs, prev, out) \
#define GEMM_WH_ADDON(bs, prev, out) \
...
@@ -299,7 +303,10 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -299,7 +303,10 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
prev_h_data
=
h0_data
+
bid
*
D
;
prev_h_data
=
h0_data
+
bid
*
D
;
prev_c_data
=
c0_data
+
bid
*
D
;
prev_c_data
=
c0_data
+
bid
*
D
;
}
else
{
}
else
{
ker
->
ComputeC1H1
(
xx_data
,
c_out_data
,
h_out_data
,
wp_data
);
one_step
.
gates
=
xx_data
;
one_step
.
ct
=
c_out_data
;
one_step
.
ht
=
h_out_data
;
ker
->
ComputeC1H1
(
&
one_step
,
&
attr
);
tstart
=
1
;
tstart
=
1
;
// move one step
// move one step
prev_h_data
=
h_out_data
;
prev_h_data
=
h_out_data
;
...
@@ -310,8 +317,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -310,8 +317,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
}
}
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
GEMM_WH_ADDON
(
1
,
prev_h_data
,
xx_data
);
GEMM_WH_ADDON
(
1
,
prev_h_data
,
xx_data
);
ker
->
ComputeCtHt
(
xx_data
,
prev_c_data
,
c_out_data
,
h_out_data
,
wp_data
,
checked_cell_data
);
one_step
.
gates
=
xx_data
;
one_step
.
ct_1
=
prev_c_data
;
one_step
.
ct
=
c_out_data
;
one_step
.
ht
=
h_out_data
;
ker
->
ComputeCtHt
(
&
one_step
,
&
attr
);
// move one step
// move one step
prev_h_data
=
h_out_data
;
prev_h_data
=
h_out_data
;
prev_c_data
=
c_out_data
;
prev_c_data
=
c_out_data
;
...
@@ -388,7 +399,11 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -388,7 +399,11 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
T
*
cur_h_out_data
=
batched_h_out_data
;
T
*
cur_h_out_data
=
batched_h_out_data
;
T
*
cur_c_out_data
=
batched_c_out_data
;
T
*
cur_c_out_data
=
batched_c_out_data
;
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
ker
->
ComputeC1H1
(
cur_in_data
,
cur_c_out_data
,
cur_h_out_data
,
wp_data
);
one_step
.
gates
=
cur_in_data
;
one_step
.
ct
=
cur_c_out_data
;
one_step
.
ht
=
cur_h_out_data
;
ker
->
ComputeC1H1
(
&
one_step
,
&
attr
);
cur_in_data
+=
D4
;
cur_in_data
+=
D4
;
cur_c_out_data
+=
D
;
cur_c_out_data
+=
D
;
cur_h_out_data
+=
D
;
cur_h_out_data
+=
D
;
...
@@ -413,8 +428,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -413,8 +428,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
T
*
cur_c_out_data
=
batched_c_out_data
;
T
*
cur_c_out_data
=
batched_c_out_data
;
T
*
cur_h_out_data
=
batched_h_out_data
;
T
*
cur_h_out_data
=
batched_h_out_data
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
ker
->
ComputeCtHt
(
cur_in_data
,
cur_prev_c_data
,
cur_c_out_data
,
one_step
.
gates
=
cur_in_data
;
cur_h_out_data
,
wp_data
,
checked_cell_data
);
one_step
.
ct_1
=
cur_prev_c_data
;
one_step
.
ct
=
cur_c_out_data
;
one_step
.
ht
=
cur_h_out_data
;
ker
->
ComputeCtHt
(
&
one_step
,
&
attr
);
// move one batch
// move one batch
cur_in_data
+=
D4
;
cur_in_data
+=
D4
;
cur_prev_c_data
+=
D
;
cur_prev_c_data
+=
D
;
...
...
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
0 → 100644
浏览文件 @
7e4bd695
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
framework
::
Tensor
;
// Operator definition for fusion_transpose_flatten_concat.
//
// InferShape computes the shape of "Out" from the shapes of the duplicable
// input "X": every input shape is permuted by the "trans_axis" attribute,
// flattened to 2-D around "flatten_axis", and the resulting 2-D shapes are
// concatenated along "concat_axis".
class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates the inputs/attributes and sets the dimension of "Out".
  //
  // Enforces that:
  //   * at least one input "X" and the output "Out" are present;
  //   * the rank of the first input equals trans_axis.size();
  //   * concat_axis is 0 or 1 (the flattened shapes always have 2 dims);
  //   * all flattened shapes agree on the non-concatenated dimension.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
                      "Inputs(X) of ConcatOp should not be empty.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ConcatOp should not be null.");

    auto ins = ctx->GetInputsDim("X");
    const size_t n = ins.size();
    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");

    std::vector<int> trans_axis =
        ctx->Attrs().Get<std::vector<int>>("trans_axis");
    int flatten_axis = ctx->Attrs().Get<int>("flatten_axis");
    int concat_axis = ctx->Attrs().Get<int>("concat_axis");

    // GetFlattenShape always yields exactly two dims, so any other
    // concat_axis would index out_dims/dimsi out of bounds below.
    PADDLE_ENFORCE(concat_axis == 0 || concat_axis == 1,
                   "The concat_axis should be 0 or 1 since the tensor is 2D "
                   "after flatting, but received %d.",
                   concat_axis);

    size_t x_rank = ins[0].size();
    size_t trans_axis_size = trans_axis.size();
    PADDLE_ENFORCE_EQ(x_rank, trans_axis_size,
                      "The input tensor's rank(%d) "
                      "should be equal to the permutation axis's size(%d)",
                      x_rank, trans_axis_size);

    // The first input seeds the output shape; the remaining inputs only
    // contribute to the concatenated dimension.
    auto dims0 =
        GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[0]));
    std::vector<int> out_dims(dims0);
    for (size_t i = 1; i < n; i++) {
      auto dimsi =
          GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[i]));
      for (int j = 0; j < static_cast<int>(dims0.size()); j++) {
        if (j == concat_axis) {
          out_dims[concat_axis] += dimsi[j];
        } else {
          PADDLE_ENFORCE_EQ(out_dims[j], dimsi[j],
                            "After flatting, the %d-th dim should be same "
                            "except the specify axis.",
                            j);
        }
      }
    }
    // A negative accumulated size is normalised to -1 before being stored.
    if (out_dims[concat_axis] < 0) {
      out_dims[concat_axis] = -1;
    }
    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
  }
};
// Declares the inputs, outputs and attributes of the
// fusion_transpose_flatten_concat operator.
class TransposeFlattenConcatFusionOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
  // Registers input "X" (duplicable), output "Out" and the three attributes
  // "trans_axis", "flatten_axis" and "concat_axis".
  void Make() override {
    AddInput(
        "X",
        "(Tensor) The input tensor, tensors with rank up to 6 are supported.")
        .AsDuplicable();
    AddOutput("Out", "(Tensor)The output tensor.");
    AddAttr<std::vector<int>>(
        "trans_axis",
        "(vector<int>) A list of values, and the size of the list should be "
        "the same with the input tensor rank. This operator permutes the input "
        "tensor's axes according to the values given.");
    // Adjacent string literals are concatenated verbatim, so each fragment
    // must carry its own trailing space to keep the words separated.
    AddAttr<int>("flatten_axis",
                 "(int) "
                 "Indicate up to which input dimensions (exclusive) should be "
                 "flattened to the outer dimension of the output. The value "
                 "for axis must be in the range [0, R], where R is the rank of "
                 "the input tensor. When axis = 0, the shape of the output "
                 "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the "
                 "input tensor is (d_0, d_1, ... d_n).");
    AddAttr<int>("concat_axis",
                 "The axis along which the input tensors will be concatenated. "
                 "It should be 0 or 1, since the tensor is 2D after flatting.");
    AddComment(R"DOC(
)DOC");
  }
};
}
// namespace operators
}
// namespace paddle
// Register the operator under the name "fusion_transpose_flatten_concat",
// binding it to its definition class and its proto/attribute maker.
// EmptyGradOpMaker is supplied as the gradient-op maker
// (presumably meaning no backward op is generated -- confirm in framework).
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_transpose_flatten_concat,
                  ops::TransposeFlattenConcatFusionOp,
                  ops::TransposeFlattenConcatFusionOpMaker,
                  paddle::framework::EmptyGradOpMaker);
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
0 → 100644
浏览文件 @
7e4bd695
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
operators
{
// Local shorthand for platform::CudnnDataType<T>; the kernel below reads its
// ::type, ::kOne() and ::kZero() members.
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
// CUDA kernel for fusion_transpose_flatten_concat.
//
// For every input tensor it issues one cudnnTransformTensor call whose source
// descriptor describes the input viewed through the "trans_axis" permutation
// and whose destination descriptor describes the matching slice of "Out".
// The write pointer into "Out" is advanced after each input so the slices end
// up concatenated along "concat_axis".
template <typename T>
class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
 public:
  // Reads inputs "X", attributes "trans_axis" / "flatten_axis" /
  // "concat_axis", and writes the fused result into output "Out".
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
    auto odims = out->dims();

    std::vector<int> trans_axis = ctx.Attr<std::vector<int>>("trans_axis");
    int flatten_axis = ctx.Attr<int>("flatten_axis");
    int concat_axis = ctx.Attr<int>("concat_axis");

    // The rank of the first input is used for every input in the loop below.
    int rank = ins[0]->dims().size();
    // use at least 4D in cudnnTransformTensor
    int max_dim = rank < 4 ? 4 : rank;
    // stride_x: strides of the source (input) view, one per permuted axis.
    // stride_y: strides of the destination view inside Out.
    // dims_y:   permuted shape, shared by both descriptors.
    std::vector<int> stride_x(max_dim, 0);
    std::vector<int> stride_y(max_dim, 0);
    std::vector<int> dims_y(max_dim, 0);

    // NOTE(review): these descriptors are only destroyed at the very end of
    // Compute. If CUDNN_ENFORCE raises on failure (TODO confirm), a failing
    // call inside the loop would skip the destroy calls -- consider a scoped
    // guard.
    cudnnTensorDescriptor_t in_desc;
    cudnnTensorDescriptor_t out_desc;
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;

    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();

    // Write cursor into Out; advanced after each input is transformed.
    T* odata = out->data<T>();
    for (size_t k = 0; k < ins.size(); ++k) {
      auto perm_shape = GetPermuteShape(trans_axis, ins[k]->dims());
      // Total number of elements of this input (product of permuted dims).
      int osize = 1;
      auto idims = ins[k]->dims();
      for (int i = 0; i < rank; i++) {
        // Stride of source axis trans_axis[i]: product of all input dims that
        // follow it (assumes the input is densely packed, last axis fastest
        // -- TODO confirm).
        stride_x[i] = 1;
        for (int j = trans_axis[i] + 1; j < rank; j++) {
          stride_x[i] *= idims[j];
        }
        dims_y[i] = perm_shape[i];
        osize *= perm_shape[i];
      }
      // Since concat happens after flatten, the output is a 2D tensor.
      // If concat_axis is 0, each input's permuted tensor is contiguous.
      // If concat_axis is 1, the axis just before flatten_axis steps by a full
      // output row, i.e. odims[1], instead of the packed stride.
      stride_y[rank - 1] = 1;
      for (int i = rank - 2; i >= 0; i--) {
        if (((i + 1) == flatten_axis) && (concat_axis == 1)) {
          stride_y[i] = odims[1];
        } else {
          stride_y[i] = stride_y[i + 1] * perm_shape[i + 1];
        }
      }

      // Pad the descriptors with size-1 axes up to max_dim.
      for (int i = rank; i < max_dim; i++) {
        stride_x[i] = 1;
        stride_y[i] = 1;
        dims_y[i] = 1;
      }

      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));

      // Transform with scaling factors kOne (source) and kZero (destination).
      CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
          handle, CudnnDataType<T>::kOne(), in_desc,
          static_cast<const void*>(ins[k]->data<T>()),
          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
      if (concat_axis == 0) {
        // Next input starts right after all elements of this one.
        odata += osize;
      } else {
        // Next input starts flat_shape[1] elements further along the row.
        auto flat_shape = GetFlattenShape(flatten_axis, perm_shape);
        odata += flat_shape[1];
      }
    }
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
  }
};
}
// namespace operators
}
// namespace paddle
// Register the CUDA kernel for "fusion_transpose_flatten_concat",
// instantiated for float and double element types.
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(fusion_transpose_flatten_concat,
                        ops::TransposeFlattenConcatFusionKernel<float>,
                        ops::TransposeFlattenConcatFusionKernel<double>);
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
0 → 100644
浏览文件 @
7e4bd695
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
namespace
paddle
{
namespace
operators
{
// Returns the shape obtained by permuting `in_dims` with `axis`:
// result[i] = in_dims[axis[i]] for every position i of `axis`.
// The result has in_dims.size() entries.
inline std::vector<int32_t> GetPermuteShape(const std::vector<int>& axis,
                                            const framework::DDim& in_dims) {
  std::vector<int32_t> permuted(in_dims.size());
  size_t pos = 0;
  for (int src_axis : axis) {
    permuted[pos] = in_dims[src_axis];
    ++pos;
  }
  return permuted;
}
// Collapses `in_dims` into a 2-D shape {outer, inner}:
//   outer = product of in_dims[i] for i <  axis
//   inner = product of in_dims[i] for i >= axis
// An empty product is 1. Products are accumulated in 64 bits and then
// converted to int32_t for the returned shape.
inline std::vector<int32_t> GetFlattenShape(const int axis,
                                            const std::vector<int>& in_dims) {
  // products[0] accumulates the outer part, products[1] the inner part.
  int64_t products[2] = {1, 1};
  const int rank = static_cast<int>(in_dims.size());
  for (int d = 0; d < rank; ++d) {
    products[d < axis ? 0 : 1] *= in_dims[d];
  }
  std::vector<int32_t> flat(2);
  flat[0] = static_cast<int32_t>(products[0]);
  flat[1] = static_cast<int32_t>(products[1]);
  return flat;
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/interpolate_op.cc
浏览文件 @
7e4bd695
...
@@ -76,11 +76,12 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -76,11 +76,12 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr
<
int
>
(
"out_h"
,
"output height of interpolate op."
);
AddAttr
<
int
>
(
"out_h"
,
"output height of interpolate op."
);
AddAttr
<
int
>
(
"out_w"
,
"output width of interpolate op."
);
AddAttr
<
int
>
(
"out_w"
,
"output width of interpolate op."
);
AddAttr
<
std
::
string
>
(
AddAttr
<
std
::
string
>
(
"interp_method"
,
"interp_method"
,
"(string, default
\"
bilinear
\"
), interpolation "
"(string), interpolation
method, can be
\"
bilinear
\"
for "
"
method, can be
\"
bilinear
\"
for "
"bilinear interpolation and
\"
nearest
\"
for nearest "
"bilinear interpolation and
\"
nearest
\"
for nearest "
"neighbor interpolation."
);
"neighbor interpolation."
)
.
SetDefault
(
"bilinear"
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
This operator samples input X to given output shape by using specified
This operator samples input X to given output shape by using specified
interpolation method, the interpolation methods can be \"nearest\"
interpolation method, the interpolation methods can be \"nearest\"
...
@@ -132,11 +133,19 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
...
@@ -132,11 +133,19 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
}
// namespace paddle
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
interpolate
,
ops
::
InterpolateOp
,
ops
::
InterpolateOpMaker
,
REGISTER_OPERATOR
(
bilinear_interp
,
ops
::
InterpolateOp
,
ops
::
InterpolateOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
interpolate_grad
,
ops
::
InterpolateOpGrad
);
REGISTER_OPERATOR
(
bilinear_interp_grad
,
ops
::
InterpolateOpGrad
);
REGISTER_OP_CPU_KERNEL
(
interpolate
,
ops
::
InterpolateKernel
<
float
>
,
REGISTER_OPERATOR
(
nearest_interp
,
ops
::
InterpolateOp
,
ops
::
InterpolateOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
nearest_interp_grad
,
ops
::
InterpolateOpGrad
);
REGISTER_OP_CPU_KERNEL
(
bilinear_interp
,
ops
::
InterpolateKernel
<
float
>
,
ops
::
InterpolateKernel
<
double
>
,
ops
::
InterpolateKernel
<
uint8_t
>
);
REGISTER_OP_CPU_KERNEL
(
bilinear_interp_grad
,
ops
::
InterpolateGradKernel
<
float
>
,
ops
::
InterpolateGradKernel
<
double
>
);
REGISTER_OP_CPU_KERNEL
(
nearest_interp
,
ops
::
InterpolateKernel
<
float
>
,
ops
::
InterpolateKernel
<
double
>
,
ops
::
InterpolateKernel
<
double
>
,
ops
::
InterpolateKernel
<
uint8_t
>
);
ops
::
InterpolateKernel
<
uint8_t
>
);
REGISTER_OP_CPU_KERNEL
(
interpolate
_grad
,
ops
::
InterpolateGradKernel
<
float
>
,
REGISTER_OP_CPU_KERNEL
(
nearest_interp
_grad
,
ops
::
InterpolateGradKernel
<
float
>
,
ops
::
InterpolateGradKernel
<
double
>
);
ops
::
InterpolateGradKernel
<
double
>
);
paddle/fluid/operators/interpolate_op.cu
浏览文件 @
7e4bd695
...
@@ -284,9 +284,15 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
...
@@ -284,9 +284,15 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
}
// namespace paddle
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
interpolate
,
ops
::
InterpolateOpCUDAKernel
<
float
>
,
REGISTER_OP_CUDA_KERNEL
(
bilinear_interp
,
ops
::
InterpolateOpCUDAKernel
<
float
>
,
ops
::
InterpolateOpCUDAKernel
<
double
>
,
ops
::
InterpolateOpCUDAKernel
<
double
>
,
ops
::
InterpolateOpCUDAKernel
<
int
>
);
ops
::
InterpolateOpCUDAKernel
<
int
>
);
REGISTER_OP_CUDA_KERNEL
(
interpolate_grad
,
REGISTER_OP_CUDA_KERNEL
(
bilinear_interp_grad
,
ops
::
InterpolateGradOpCUDAKernel
<
float
>
,
ops
::
InterpolateGradOpCUDAKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
nearest_interp
,
ops
::
InterpolateOpCUDAKernel
<
float
>
,
ops
::
InterpolateOpCUDAKernel
<
double
>
,
ops
::
InterpolateOpCUDAKernel
<
int
>
);
REGISTER_OP_CUDA_KERNEL
(
nearest_interp_grad
,
ops
::
InterpolateGradOpCUDAKernel
<
float
>
,
ops
::
InterpolateGradOpCUDAKernel
<
float
>
,
ops
::
InterpolateGradOpCUDAKernel
<
double
>
);
ops
::
InterpolateGradOpCUDAKernel
<
double
>
);
paddle/fluid/operators/lookup_sparse_table_op.cc
浏览文件 @
7e4bd695
...
@@ -67,6 +67,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
...
@@ -67,6 +67,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
framework
::
proto
::
VarType
::
FP32
,
framework
::
proto
::
VarType
::
FP32
,
"The sparse table only support FP32"
);
"The sparse table only support FP32"
);
w_t
->
Get
(
ids_t
,
out_t
,
true
,
is_test
);
w_t
->
Get
(
ids_t
,
out_t
,
true
,
is_test
);
out_t
->
set_lod
(
ids_t
.
lod
());
}
}
};
};
...
...
paddle/fluid/operators/math/blas_impl.cu.h
浏览文件 @
7e4bd695
...
@@ -16,6 +16,9 @@
...
@@ -16,6 +16,9 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/gpu_info.h"
DECLARE_bool
(
enable_cublas_tensor_op_math
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -42,11 +45,44 @@ struct CUBlas<float> {
...
@@ -42,11 +45,44 @@ struct CUBlas<float> {
}
}
template
<
typename
...
ARGS
>
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
static
void
GEMM_
STRIDED_
BATCH
(
ARGS
...
args
)
{
#if CUDA_VERSION >= 8000
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemmStridedBatched
(
args
...));
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemmStridedBatched
(
args
...));
#else
#else
PADDLE_THROW
(
"SgemmStridedBatched is not supported on cuda <= 7.5"
);
PADDLE_THROW
(
"SgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
platform
::
CUDADeviceContext
*
dev_ctx
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
float
*
alpha
,
const
void
*
A
,
cudaDataType_t
Atype
,
int
lda
,
const
void
*
B
,
cudaDataType_t
Btype
,
int
ldb
,
const
float
*
beta
,
void
*
C
,
cudaDataType_t
Ctype
,
int
ldc
)
{
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
auto
cublas_call
=
[
&
]()
{
#if CUDA_VERSION >= 8000
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
platform
::
TensorCoreAvailable
()
?
"True"
:
"False"
);
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemmEx
(
dev_ctx
->
cublas_handle
(),
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
));
#else
PADDLE_THROW
(
"cublasSgemmEx is supported on cuda >= 8.0"
);
#endif
};
#if CUDA_VERSION >= 9000
// NOTES: To use Tensor Core, we should change the cublas config,
// but the cublas may be hold by multi-thread.
dev_ctx
->
CublasCall
(
cublas_call
,
CUBLAS_TENSOR_OP_MATH
);
#else
cublas_call
();
#endif
#endif
}
}
};
};
...
@@ -69,13 +105,18 @@ struct CUBlas<double> {
...
@@ -69,13 +105,18 @@ struct CUBlas<double> {
}
}
template
<
typename
...
ARGS
>
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
static
void
GEMM_
STRIDED_
BATCH
(
ARGS
...
args
)
{
#if CUDA_VERSION >= 8000
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemmStridedBatched
(
args
...));
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemmStridedBatched
(
args
...));
#else
#else
PADDLE_THROW
(
"DgemmStridedBatched is not supported on cuda <= 7.5"
);
PADDLE_THROW
(
"DgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
#endif
}
}
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
ARGS
...
args
)
{
PADDLE_THROW
(
"Currently there are not cublasDgemmEx."
);
}
};
};
template
<
>
template
<
>
...
@@ -96,10 +137,12 @@ struct CUBlas<platform::float16> {
...
@@ -96,10 +137,12 @@ struct CUBlas<platform::float16> {
reinterpret_cast
<
__half
*>
(
C
),
ldc
));
reinterpret_cast
<
__half
*>
(
C
),
ldc
));
}
}
static
void
GEMM_BATCH
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
static
void
GEMM_STRIDED_BATCH
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
float16
*
alpha
,
const
float16
*
A
,
int
lda
,
const
float16
*
alpha
,
const
float16
*
A
,
long
long
int
strideA
,
const
float16
*
B
,
// NOLINT
int
lda
,
long
long
int
strideA
,
// NOLINT
const
float16
*
B
,
// NOLINT
int
ldb
,
long
long
int
strideB
,
// NOLINT
int
ldb
,
long
long
int
strideB
,
// NOLINT
const
float16
*
beta
,
float16
*
C
,
int
ldc
,
const
float16
*
beta
,
float16
*
C
,
int
ldc
,
long
long
int
strideC
,
// NOLINT
long
long
int
strideC
,
// NOLINT
...
@@ -114,6 +157,45 @@ struct CUBlas<platform::float16> {
...
@@ -114,6 +157,45 @@ struct CUBlas<platform::float16> {
ldc
,
strideC
,
batchCount
));
ldc
,
strideC
,
batchCount
));
#else
#else
PADDLE_THROW
(
"HgemmStridedBatched is not supported on cuda <= 7.5"
);
PADDLE_THROW
(
"HgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
platform
::
CUDADeviceContext
*
dev_ctx
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
void
*
alpha
,
const
void
*
A
,
cudaDataType_t
Atype
,
int
lda
,
const
void
*
B
,
cudaDataType_t
Btype
,
int
ldb
,
const
void
*
beta
,
void
*
C
,
cudaDataType_t
Ctype
,
int
ldc
,
cudaDataType_t
computeType
)
{
auto
cublas_call
=
[
&
]()
{
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
#if CUDA_VERSION >= 9000
bool
use_tensor_op_math
=
platform
::
TensorCoreAvailable
();
if
(
use_tensor_op_math
)
{
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
use_tensor_op_math
?
"True"
:
"False"
);
#endif // CUDA_VERSION >= 9000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasGemmEx
(
dev_ctx
->
cublas_handle
(),
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
,
computeType
,
algo
));
#else
PADDLE_THROW
(
"cublasGemmEx is supported on cuda >= 8.0"
);
#endif
};
#if CUDA_VERSION >= 9000
// NOTES: To use Tensor Core, we should change the cublas config,
// but the cublas may be hold by multi-thread.
dev_ctx
->
CublasCall
(
cublas_call
,
CUBLAS_TENSOR_OP_MATH
);
#else
cublas_call
();
#endif
#endif
}
}
};
};
...
@@ -133,8 +215,21 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
...
@@ -133,8 +215,21 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
cublasOperation_t
cuTransB
=
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
#if CUDA_VERSION >= 8000
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
N
);
if
(
FLAGS_enable_cublas_tensor_op_math
&&
std
::
is_same
<
T
,
float
>::
value
)
{
auto
&
cuda_ctx
=
const_cast
<
platform
::
CUDADeviceContext
&>
(
context_
);
CUBlas
<
T
>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
CUDA_R_32F
,
ldb
,
A
,
CUDA_R_32F
,
lda
,
&
beta
,
C
,
CUDA_R_32F
,
N
);
}
else
{
#endif // CUDA_VERSION >= 8000
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
N
);
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
}
template
<
>
template
<
>
...
@@ -157,30 +252,18 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
...
@@ -157,30 +252,18 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
PADDLE_ENFORCE_GE
(
context_
.
GetComputeCapability
(),
53
,
PADDLE_ENFORCE_GE
(
context_
.
GetComputeCapability
(),
53
,
"cublas fp16 gemm requires GPU compute capability >= 53"
);
"cublas fp16 gemm requires GPU compute capability >= 53"
);
#if CUDA_VERSION >= 8000
float
h_alpha
=
static_cast
<
float
>
(
alpha
);
float
h_alpha
=
static_cast
<
float
>
(
alpha
);
float
h_beta
=
static_cast
<
float
>
(
beta
);
float
h_beta
=
static_cast
<
float
>
(
beta
);
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
#if CUDA_VERSION >= 8000
#if CUDA_VERSION >= 9000
if
(
context_
.
GetComputeCapability
()
>=
70
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSetMathMode
(
context_
.
cublas_handle
(),
CUBLAS_TENSOR_OP_MATH
));
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
else
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSetMathMode
(
context_
.
cublas_handle
(),
CUBLAS_DEFAULT_MATH
));
}
#endif // CUDA_VERSION >= 9000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
// using tensor cores in volta GPUs.
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasGemmEx
(
auto
&
cuda_ctx
=
const_cast
<
platform
::
CUDADeviceContext
&>
(
context_
);
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
B
,
CUBlas
<
platform
::
float16
>::
GEMM_EX
(
CUDA_R_16F
,
ldb
,
A
,
CUDA_R_16F
,
lda
,
&
h_beta
,
C
,
CUDA_R_16F
,
N
,
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
B
,
CUDA_R_16F
,
ldb
,
A
,
CUDA_R_
32F
,
algo
)
);
CUDA_R_
16F
,
lda
,
&
h_beta
,
C
,
CUDA_R_16F
,
N
,
CUDA_R_32F
);
#else
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
CUBlas
<
platform
::
float16
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
CUBlas
<
platform
::
float16
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
...
@@ -199,8 +282,38 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
...
@@ -199,8 +282,38 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
// the cblas convention.
// the cblas convention.
cublasOperation_t
cuTransA
=
transA
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
cublasOperation_t
cuTransA
=
transA
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
cublasOperation_t
cuTransB
=
transB
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
cublasOperation_t
cuTransB
=
transB
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
#if CUDA_VERSION >= 8000
if
(
FLAGS_enable_cublas_tensor_op_math
&&
std
::
is_same
<
T
,
float
>::
value
)
{
auto
&
cuda_ctx
=
const_cast
<
platform
::
CUDADeviceContext
&>
(
context_
);
CUBlas
<
T
>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
CUDA_R_32F
,
ldb
,
A
,
CUDA_R_32F
,
lda
,
&
beta
,
C
,
CUDA_R_32F
,
ldc
);
}
else
{
#endif // CUDA_VERSION >= 8000
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template
<
>
template
<
>
inline
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
platform
::
float16
alpha
,
const
platform
::
float16
*
A
,
int
lda
,
const
platform
::
float16
*
B
,
int
ldb
,
platform
::
float16
beta
,
platform
::
float16
*
C
,
int
ldc
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t
cuTransA
=
transA
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
cublasOperation_t
cuTransB
=
transB
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
CUBlas
<
platform
::
float16
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
}
}
template
<
>
template
<
>
...
@@ -238,9 +351,34 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...
@@ -238,9 +351,34 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
const
int64_t
strideC
=
M
*
N
;
const
int64_t
strideC
=
M
*
N
;
CUBlas
<
T
>::
GEMM_BATCH
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
#if CUDA_VERSION >= 9010
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
if
(
FLAGS_enable_cublas_tensor_op_math
&&
std
::
is_same
<
T
,
float
>::
value
)
{
strideC
,
batchCount
);
auto
cublas_call
=
[
&
]()
{
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
bool
use_tensor_op_math
=
platform
::
TensorCoreAvailable
();
if
(
use_tensor_op_math
)
{
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
use_tensor_op_math
?
"True"
:
"False"
);
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasGemmStridedBatchedEx
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
CUDA_R_32F
,
ldb
,
strideB
,
A
,
CUDA_R_32F
,
lda
,
strideA
,
&
beta
,
C
,
CUDA_R_32F
,
ldc
,
strideC
,
batchCount
,
CUDA_R_32F
,
algo
));
};
auto
&
dev_ctx
=
const_cast
<
platform
::
CUDADeviceContext
&>
(
context_
);
dev_ctx
.
CublasCall
(
cublas_call
,
CUBLAS_TENSOR_OP_MATH
);
}
else
{
#endif // CUDA_VERSION >= 9010
CUBlas
<
T
>::
GEMM_STRIDED_BATCH
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
);
#if CUDA_VERSION >= 9010
}
#endif // CUDA_VERSION >= 9010
}
}
}
// namespace math
}
// namespace math
...
...
paddle/fluid/operators/math/cpu_vec_test.cc
浏览文件 @
7e4bd695
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <sys/time.h>
#include <cmath>
#include <cmath>
#include <cstring>
#include <cstring>
#include <random>
#include <random>
...
@@ -22,6 +21,7 @@ limitations under the License. */
...
@@ -22,6 +21,7 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "gtest/gtest.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/port.h"
inline
double
GetCurrentUS
()
{
inline
double
GetCurrentUS
()
{
struct
timeval
time
;
struct
timeval
time
;
...
...
paddle/fluid/operators/math/fc_compute.h
浏览文件 @
7e4bd695
...
@@ -17,8 +17,6 @@ limitations under the License. */
...
@@ -17,8 +17,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
...
@@ -43,7 +41,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
...
@@ -43,7 +41,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
.
template
Get
<
jitkernel
::
VAddKernel
<
T
>
>
(
N
);
.
template
Get
<
jitkernel
::
VAddKernel
<
T
>
>
(
N
);
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
if (FLAGS_paddle_num_threads > 1)
#pragma omp parallel for
#endif
#endif
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
T
*
dst
=
Y
+
i
*
N
;
T
*
dst
=
Y
+
i
*
N
;
...
...
paddle/fluid/operators/math/im2col_test.cc
浏览文件 @
7e4bd695
...
@@ -14,9 +14,9 @@ limitations under the License. */
...
@@ -14,9 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h>
#include <gtest/gtest.h>
#include <sys/time.h>
#include <vector>
#include <vector>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
#include "paddle/fluid/platform/port.h"
template
<
typename
DeviceContext
,
typename
Place
>
template
<
typename
DeviceContext
,
typename
Place
>
void
testIm2col
()
{
void
testIm2col
()
{
...
...
paddle/fluid/operators/math/jit_code.cc
浏览文件 @
7e4bd695
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/math/jit_code.h"
#include "paddle/fluid/operators/math/jit_code.h"
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me
#include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me
namespace
paddle
{
namespace
paddle
{
...
@@ -139,32 +140,10 @@ bool VActJitCode::init(int d, operand_type type) {
...
@@ -139,32 +140,10 @@ bool VActJitCode::init(int d, operand_type type) {
}
}
void
VActJitCode
::
generate
()
{
void
VActJitCode
::
generate
()
{
xmm_t
xmm_zero
=
xmm_t
(
2
);
ymm_t
ymm_zero
=
ymm_t
(
2
);
if
(
type_
==
operand_type
::
relu
)
{
vxorps
(
ymm_zero
,
ymm_zero
,
ymm_zero
);
}
int
offset
=
0
;
int
offset
=
0
;
for
(
int
i
=
0
;
i
<
num_
/
YMM_FLOAT_BLOCK
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
/
YMM_FLOAT_BLOCK
;
++
i
)
{
vmovups
(
ymm_src
,
ptr
[
param1
+
offset
]);
vmovups
(
ymm_src
,
ptr
[
param1
+
offset
]);
switch
(
type_
)
{
act
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
type_
);
case
operand_type
::
relu
:
relu_jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
ymm_zero
);
break
;
case
operand_type
::
exp
:
exp_jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
sigmoid
:
sigmoid_jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
tanh
:
tanh_jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
identity
:
break
;
default:
break
;
}
vmovups
(
ptr
[
param2
+
offset
],
ymm_dst
);
vmovups
(
ptr
[
param2
+
offset
],
ymm_dst
);
offset
+=
sizeof
(
float
)
*
YMM_FLOAT_BLOCK
;
offset
+=
sizeof
(
float
)
*
YMM_FLOAT_BLOCK
;
}
}
...
@@ -181,22 +160,7 @@ void VActJitCode::generate() {
...
@@ -181,22 +160,7 @@ void VActJitCode::generate() {
block
=
1
;
block
=
1
;
vmovss
(
xmm_src
,
ptr
[
param1
+
offset
]);
vmovss
(
xmm_src
,
ptr
[
param1
+
offset
]);
}
}
switch
(
type_
)
{
act
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
type_
);
case
operand_type
::
relu
:
relu_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
xmm_zero
);
break
;
case
operand_type
::
exp
:
exp_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
sigmoid
:
sigmoid_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
tanh
:
tanh_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
2
,
3
,
4
,
5
);
break
;
default:
break
;
}
if
(
rest
>=
4
)
{
if
(
rest
>=
4
)
{
vmovups
(
ptr
[
param2
+
offset
],
xmm_dst
);
vmovups
(
ptr
[
param2
+
offset
],
xmm_dst
);
}
else
if
(
rest
>=
2
)
{
}
else
if
(
rest
>=
2
)
{
...
@@ -210,6 +174,158 @@ void VActJitCode::generate() {
...
@@ -210,6 +174,158 @@ void VActJitCode::generate() {
ret
();
ret
();
}
}
bool
LSTMJitCode
::
init
(
int
d
)
{
return
MayIUse
(
avx
)
&&
d
%
8
==
0
;
}
void
LSTMJitCode
::
generate
()
{
if
(
use_peephole_
)
{
preCode
();
}
reg64_t
reg_ptr_gates
=
rax
;
reg64_t
reg_ptr_ct_1
=
r9
;
reg64_t
reg_ptr_ct
=
r10
;
reg64_t
reg_ptr_ht
=
r11
;
reg64_t
reg_ptr_wp
=
r12
;
mov
(
reg_ptr_gates
,
ptr
[
param1
+
offsetof
(
lstm_t
,
gates
)]);
mov
(
reg_ptr_ct_1
,
ptr
[
param1
+
offsetof
(
lstm_t
,
ct_1
)]);
mov
(
reg_ptr_ct
,
ptr
[
param1
+
offsetof
(
lstm_t
,
ct
)]);
mov
(
reg_ptr_ht
,
ptr
[
param1
+
offsetof
(
lstm_t
,
ht
)]);
if
(
use_peephole_
)
{
mov
(
reg_ptr_wp
,
ptr
[
param1
+
offsetof
(
lstm_t
,
wp
)]);
}
int
offset
=
0
;
int
d
=
num_
*
sizeof
(
float
);
for
(
int
i
=
0
;
i
<
num_
/
YMM_FLOAT_BLOCK
;
++
i
)
{
/* gates: W_ch, W_ih, W_fh, W_oh */
ymm_t
ymm_c
=
ymm_t
(
0
);
ymm_t
ymm_i
=
ymm_t
(
1
);
ymm_t
ymm_f
=
ymm_t
(
2
);
ymm_t
ymm_o
=
ymm_t
(
3
);
ymm_t
ymm_ct_1
=
ymm_t
(
4
);
ymm_t
ymm_wp0
=
ymm_t
(
5
);
ymm_t
ymm_wp1
=
ymm_t
(
6
);
ymm_t
ymm_wp2
=
ymm_t
(
7
);
vmovups
(
ymm_c
,
ptr
[
reg_ptr_gates
+
offset
]);
vmovups
(
ymm_i
,
ptr
[
reg_ptr_gates
+
offset
+
d
]);
vmovups
(
ymm_f
,
ptr
[
reg_ptr_gates
+
offset
+
2
*
d
]);
vmovups
(
ymm_o
,
ptr
[
reg_ptr_gates
+
offset
+
3
*
d
]);
if
(
!
compute_c1h1_
)
{
vmovups
(
ymm_ct_1
,
ptr
[
reg_ptr_ct_1
+
offset
]);
}
if
(
use_peephole_
)
{
vmovups
(
ymm_wp0
,
ptr
[
reg_ptr_wp
+
offset
]);
vmovups
(
ymm_wp1
,
ptr
[
reg_ptr_wp
+
offset
+
d
]);
vmovups
(
ymm_wp2
,
ptr
[
reg_ptr_wp
+
offset
+
2
*
d
]);
}
/* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */
// act_cand(c)
act
<
ymm_t
>
(
ymm_c
,
ymm_c
,
act_cand_
);
// act_gate(i) or act_gate(ct_1 * wp0 + i)
if
(
!
compute_c1h1_
&&
use_peephole_
)
{
vmulps
(
ymm_wp0
,
ymm_ct_1
,
ymm_wp0
);
vaddps
(
ymm_i
,
ymm_i
,
ymm_wp0
);
}
act
<
ymm_t
>
(
ymm_i
,
ymm_i
,
act_gate_
);
vmulps
(
ymm_c
,
ymm_c
,
ymm_i
);
if
(
!
compute_c1h1_
)
{
// act_gate(f) or act_gate(ct_1 * wp1 + f)
if
(
use_peephole_
)
{
vmulps
(
ymm_wp1
,
ymm_ct_1
,
ymm_wp1
);
vaddps
(
ymm_f
,
ymm_f
,
ymm_wp1
);
}
act
<
ymm_t
>
(
ymm_f
,
ymm_f
,
act_gate_
);
// ct
vmulps
(
ymm_f
,
ymm_f
,
ymm_ct_1
);
vaddps
(
ymm_f
,
ymm_f
,
ymm_c
);
}
/* H_t = act_cell(C_t) * act_gate(o) */
// act_cell(C_t)
ymm_t
ymm_ct
=
compute_c1h1_
?
ymm_c
:
ymm_f
;
ymm_t
ymm_tmp
=
ymm_i
;
act
<
ymm_t
>
(
ymm_tmp
,
ymm_ct
,
act_cell_
);
// act_gate(o) or act_gate(ct * wp2 + o)
if
(
use_peephole_
)
{
vmulps
(
ymm_wp2
,
ymm_ct
,
ymm_wp2
);
vaddps
(
ymm_o
,
ymm_o
,
ymm_wp2
);
}
act
<
ymm_t
>
(
ymm_o
,
ymm_o
,
act_gate_
);
// ht
vmulps
(
ymm_o
,
ymm_o
,
ymm_tmp
);
// save ct and ht
vmovups
(
ptr
[
reg_ptr_ct
+
offset
],
ymm_ct
);
vmovups
(
ptr
[
reg_ptr_ht
+
offset
],
ymm_o
);
offset
+=
sizeof
(
float
)
*
YMM_FLOAT_BLOCK
;
}
if
(
use_peephole_
)
{
postCode
();
}
else
{
ret
();
}
}
bool
GRUJitCode
::
init
(
int
d
)
{
return
MayIUse
(
avx
)
&&
d
%
8
==
0
;
}
void
GRUJitCode
::
generate
()
{
reg64_t
reg_ptr_gates
=
rax
;
reg64_t
reg_ptr_ht_1
=
r9
;
reg64_t
reg_ptr_ht
=
r10
;
mov
(
reg_ptr_gates
,
ptr
[
param1
+
offsetof
(
gru_t
,
gates
)]);
mov
(
reg_ptr_ht_1
,
ptr
[
param1
+
offsetof
(
gru_t
,
ht_1
)]);
mov
(
reg_ptr_ht
,
ptr
[
param1
+
offsetof
(
gru_t
,
ht
)]);
ymm_t
ymm_one
=
ymm_t
(
0
);
if
(
id_
==
2
)
{
reg64_t
reg_ptr_tmp
=
r11
;
mov
(
reg_ptr_tmp
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
ymm_one
,
ptr
[
reg_ptr_tmp
+
OFFSET_EXP_ONE
]);
}
int
offset
=
0
;
int
d
=
num_
*
sizeof
(
float
);
for
(
int
i
=
0
;
i
<
num_
/
YMM_FLOAT_BLOCK
;
++
i
)
{
ymm_t
ymm_u
=
ymm_t
(
1
);
ymm_t
ymm_r
=
ymm_t
(
2
);
ymm_t
ymm_s
=
ymm_t
(
3
);
ymm_t
ymm_ht_1
=
ymm_t
(
4
);
// W: {W_update, W_reset; W_state}
if
(
id_
==
0
||
id_
==
2
)
{
vmovups
(
ymm_u
,
ptr
[
reg_ptr_gates
+
offset
]);
vmovups
(
ymm_s
,
ptr
[
reg_ptr_gates
+
offset
+
2
*
d
]);
}
if
(
id_
==
1
)
{
vmovups
(
ymm_r
,
ptr
[
reg_ptr_gates
+
offset
+
d
]);
}
if
(
id_
==
1
||
id_
==
2
)
{
vmovups
(
ymm_ht_1
,
ptr
[
reg_ptr_ht_1
+
offset
]);
}
if
(
id_
==
0
)
{
// ht = act_gate(u) * act_cand(s)
act
<
ymm_t
>
(
ymm_u
,
ymm_u
,
act_gate_
);
act
<
ymm_t
>
(
ymm_s
,
ymm_s
,
act_cand_
);
vmulps
(
ymm_s
,
ymm_s
,
ymm_u
);
vmovups
(
ptr
[
reg_ptr_ht
+
offset
],
ymm_s
);
}
else
if
(
id_
==
1
)
{
// ht = act_gate(r) * ht_1
act
<
ymm_t
>
(
ymm_r
,
ymm_r
,
act_gate_
);
vmulps
(
ymm_r
,
ymm_r
,
ymm_ht_1
);
vmovups
(
ptr
[
reg_ptr_ht
+
offset
],
ymm_r
);
}
else
if
(
id_
==
2
)
{
// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
ymm_t
ymm_one_inner
=
ymm_t
(
ymm_one
.
getIdx
());
act
<
ymm_t
>
(
ymm_u
,
ymm_u
,
act_gate_
);
act
<
ymm_t
>
(
ymm_s
,
ymm_s
,
act_cand_
);
vmulps
(
ymm_s
,
ymm_s
,
ymm_u
);
vsubps
(
ymm_u
,
ymm_one_inner
,
ymm_u
);
vmulps
(
ymm_u
,
ymm_ht_1
,
ymm_u
);
vaddps
(
ymm_u
,
ymm_s
,
ymm_u
);
vmovups
(
ptr
[
reg_ptr_ht
+
offset
],
ymm_u
);
}
offset
+=
sizeof
(
float
)
*
YMM_FLOAT_BLOCK
;
}
ret
();
}
}
// namespace gen
}
// namespace gen
}
// namespace jitkernel
}
// namespace jitkernel
}
// namespace math
}
// namespace math
...
...
paddle/fluid/operators/math/jit_code.h
浏览文件 @
7e4bd695
...
@@ -16,6 +16,7 @@ limitations under the License. */
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <string>
#include "paddle/fluid/operators/math/jit_gen.h"
#include "paddle/fluid/operators/math/jit_gen.h"
#include "paddle/fluid/operators/math/jit_kernel_impl.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -46,14 +47,6 @@ extern const float exp_float_consts[];
...
@@ -46,14 +47,6 @@ extern const float exp_float_consts[];
extern
const
int
exp_int_0x7f
[];
extern
const
int
exp_int_0x7f
[];
extern
int
g_tmp_mem
[];
extern
int
g_tmp_mem
[];
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define XMM_FLOAT_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define ZMM_FLOAT_BLOCK 16
#define ALIGN32 __attribute__((aligned(32)))
#define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define EXP_LOW -88.3762626647949f
...
@@ -176,31 +169,34 @@ class VActJitCode : public JitCode {
...
@@ -176,31 +169,34 @@ class VActJitCode : public JitCode {
protected:
protected:
// compute relu with ymm, xmm
// compute relu with ymm, xmm
template
<
typename
JMM
>
template
<
typename
JMM
>
void
relu_jmm
(
JMM
&
dst
,
JMM
&
src
,
JMM
&
zero
)
{
// NOLINT
void
relu_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
zero_idx
=
15
)
{
// NOLINT
JMM
zero
=
JMM
(
zero_idx
);
vxorps
(
zero
,
zero
,
zero
);
vmaxps
(
dst
,
src
,
zero
);
vmaxps
(
dst
,
src
,
zero
);
}
}
// compute exp with ymm, xmm
// compute exp with ymm, xmm
template
<
typename
JMM
>
template
<
typename
JMM
>
void
exp_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
// NOLINT
void
exp_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
src_idx
=
11
,
int
fx_idx
=
12
,
// NOLINT
int
mask_idx
=
4
,
int
tmp_idx
=
5
)
{
int
fy_idx
=
13
,
int
mask_idx
=
14
,
int
tmp_idx
=
1
5
)
{
using
namespace
platform
::
jit
;
// NOLINT
using
namespace
platform
::
jit
;
// NOLINT
assert
(
src
.
getIdx
()
!=
dst
.
getIdx
());
// TODO(TJ): use enfore
// check all idx can not equal
// check all idx can not equal
JMM
jmm_src
=
JMM
(
src_idx
);
JMM
jmm_fx
=
JMM
(
fx_idx
);
JMM
jmm_fx
=
JMM
(
fx_idx
);
JMM
jmm_fy
=
JMM
(
fy_idx
);
JMM
jmm_fy
=
JMM
(
fy_idx
);
JMM
jmm_mask
=
JMM
(
mask_idx
);
JMM
jmm_mask
=
JMM
(
mask_idx
);
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
reg64_t
reg_ptr_global
=
rax
;
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
push
(
reg_ptr_global
);
vmovaps
(
jmm_src
,
src
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_HIG
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_HIG
]);
vminps
(
src
,
src
,
jmm_tmp
);
vminps
(
jmm_src
,
jmm_
src
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOW
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOW
]);
vmaxps
(
src
,
src
,
jmm_tmp
);
vmaxps
(
jmm_src
,
jmm_
src
,
jmm_tmp
);
// express exp(x) as exp(g + n*log(2))
// express exp(x) as exp(g + n*log(2))
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOG2EF
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOG2EF
]);
vmulps
(
jmm_fx
,
src
,
jmm_tmp
);
vmulps
(
jmm_fx
,
jmm_
src
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_0P5
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_0P5
]);
vaddps
(
jmm_fx
,
jmm_fx
,
jmm_tmp
);
vaddps
(
jmm_fx
,
jmm_fx
,
jmm_tmp
);
vroundps
(
jmm_fy
,
jmm_fx
,
0x01
);
vroundps
(
jmm_fy
,
jmm_fx
,
0x01
);
...
@@ -214,21 +210,21 @@ class VActJitCode : public JitCode {
...
@@ -214,21 +210,21 @@ class VActJitCode : public JitCode {
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_C2
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_C2
]);
JMM
ymm_z
=
JMM
(
jmm_mask
.
getIdx
());
JMM
ymm_z
=
JMM
(
jmm_mask
.
getIdx
());
vmulps
(
ymm_z
,
jmm_fx
,
jmm_tmp
);
vmulps
(
ymm_z
,
jmm_fx
,
jmm_tmp
);
vsubps
(
src
,
src
,
jmm_fy
);
vsubps
(
jmm_src
,
jmm_
src
,
jmm_fy
);
vsubps
(
src
,
src
,
ymm_z
);
vsubps
(
jmm_src
,
jmm_
src
,
ymm_z
);
vmulps
(
ymm_z
,
src
,
src
);
vmulps
(
ymm_z
,
jmm_src
,
jmm_
src
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P0
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P0
]);
vmulps
(
dst
,
src
,
jmm_tmp
);
vmulps
(
dst
,
jmm_
src
,
jmm_tmp
);
for
(
size_t
i
=
OFFSET_EXP_P1
;
i
<
OFFSET_EXP_P5
;
for
(
size_t
i
=
OFFSET_EXP_P1
;
i
<
OFFSET_EXP_P5
;
i
+=
(
YMM_FLOAT_BLOCK
*
sizeof
(
float
)))
{
i
+=
(
YMM_FLOAT_BLOCK
*
sizeof
(
float
)))
{
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
i
]);
// P1~P4
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
i
]);
// P1~P4
vaddps
(
dst
,
dst
,
jmm_tmp
);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vmulps
(
dst
,
dst
,
src
);
vmulps
(
dst
,
dst
,
jmm_
src
);
}
}
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P5
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P5
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vmulps
(
dst
,
dst
,
ymm_z
);
vmulps
(
dst
,
dst
,
ymm_z
);
vaddps
(
dst
,
dst
,
src
);
vaddps
(
dst
,
dst
,
jmm_
src
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vaddps
(
dst
,
dst
,
jmm_tmp
);
// build 2^n
// build 2^n
...
@@ -265,20 +261,23 @@ class VActJitCode : public JitCode {
...
@@ -265,20 +261,23 @@ class VActJitCode : public JitCode {
// compute sigmoid with ymm, xmm
// compute sigmoid with ymm, xmm
template
<
typename
JMM
>
template
<
typename
JMM
>
void
sigmoid_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
fx_idx
=
2
,
// NOLINT
void
sigmoid_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
src_idx
=
11
,
// NOLINT
int
fy_idx
=
3
,
int
mask_idx
=
4
,
int
tmp_idx
=
5
)
{
int
fx_idx
=
12
,
int
fy_idx
=
13
,
int
mask_idx
=
14
,
int
tmp_idx
=
15
)
{
// y = 1 / (1 + e^-x)
// y = 1 / (1 + e^-x)
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
JMM
jmm_src
=
JMM
(
src_idx
);
reg64_t
reg_ptr_global
=
rax
;
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
push
(
reg_ptr_global
);
vmovaps
(
jmm_src
,
src
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MAX
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MAX
]);
vminps
(
src
,
src
,
jmm_tmp
);
vminps
(
jmm_src
,
jmm_
src
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MIN
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MIN
]);
vmaxps
(
src
,
src
,
jmm_tmp
);
vmaxps
(
jmm_src
,
jmm_
src
,
jmm_tmp
);
vxorps
(
jmm_tmp
,
jmm_tmp
,
jmm_tmp
);
vxorps
(
jmm_tmp
,
jmm_tmp
,
jmm_tmp
);
vsubps
(
src
,
jmm_tmp
,
src
);
vsubps
(
jmm_src
,
jmm_tmp
,
jmm_
src
);
exp_jmm
<
JMM
>
(
dst
,
src
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
exp_jmm
<
JMM
>
(
dst
,
jmm_src
,
src_idx
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vdivps
(
dst
,
jmm_tmp
,
dst
);
vdivps
(
dst
,
jmm_tmp
,
dst
);
...
@@ -287,19 +286,22 @@ class VActJitCode : public JitCode {
...
@@ -287,19 +286,22 @@ class VActJitCode : public JitCode {
// compute tanh with ymm, xmm
// compute tanh with ymm, xmm
template
<
typename
JMM
>
template
<
typename
JMM
>
void
tanh_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
// NOLINT
void
tanh_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
src_idx
=
11
,
// NOLINT
int
mask_idx
=
4
,
int
tmp_idx
=
5
)
{
int
fx_idx
=
12
,
int
fy_idx
=
13
,
int
mask_idx
=
14
,
int
tmp_idx
=
15
)
{
// y = 2 / (1 + e^(-2x)) - 1
// y = 2 / (1 + e^(-2x)) - 1
JMM
jmm_src
=
JMM
(
src_idx
);
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
JMM
jmm_zero
=
JMM
(
mask_idx
);
JMM
jmm_zero
=
JMM
(
mask_idx
);
reg64_t
reg_ptr_global
=
rax
;
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
push
(
reg_ptr_global
);
vmovaps
(
jmm_src
,
src
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vxorps
(
jmm_zero
,
jmm_zero
,
jmm_zero
);
vxorps
(
jmm_zero
,
jmm_zero
,
jmm_zero
);
vsubps
(
jmm_tmp
,
jmm_zero
,
jmm_tmp
);
vsubps
(
jmm_tmp
,
jmm_zero
,
jmm_tmp
);
vmulps
(
src
,
src
,
jmm_tmp
);
vmulps
(
jmm_src
,
jmm_
src
,
jmm_tmp
);
exp_jmm
<
JMM
>
(
dst
,
src
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
exp_jmm
<
JMM
>
(
dst
,
jmm_src
,
src_idx
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
...
@@ -309,6 +311,30 @@ class VActJitCode : public JitCode {
...
@@ -309,6 +311,30 @@ class VActJitCode : public JitCode {
pop
(
reg_ptr_global
);
pop
(
reg_ptr_global
);
}
}
template
<
typename
JMM
>
void
act
(
JMM
&
dst
,
JMM
&
src
,
operand_type
type
)
{
// NOLINT
// use 11~15
switch
(
type
)
{
case
operand_type
::
relu
:
relu_jmm
<
JMM
>
(
dst
,
src
,
15
);
break
;
case
operand_type
::
exp
:
exp_jmm
<
JMM
>
(
dst
,
src
,
11
,
12
,
13
,
14
,
15
);
break
;
case
operand_type
::
sigmoid
:
sigmoid_jmm
<
JMM
>
(
dst
,
src
,
11
,
12
,
13
,
14
,
15
);
break
;
case
operand_type
::
tanh
:
tanh_jmm
<
JMM
>
(
dst
,
src
,
11
,
12
,
13
,
14
,
15
);
break
;
case
operand_type
::
identity
:
break
;
default:
// throw error
break
;
}
}
protected:
protected:
int
num_
;
int
num_
;
operand_type
type_
;
operand_type
type_
;
...
@@ -322,6 +348,148 @@ class VActJitCode : public JitCode {
...
@@ -322,6 +348,148 @@ class VActJitCode : public JitCode {
ymm_t
ymm_dst
=
ymm_t
(
1
);
ymm_t
ymm_dst
=
ymm_t
(
1
);
};
};
class
LSTMJitCode
:
public
VActJitCode
{
public:
const
char
*
name
()
const
override
{
std
::
string
base
=
"LSTMJitCode"
;
if
(
use_peephole_
)
{
base
+=
"_Peephole"
;
}
if
(
compute_c1h1_
)
{
base
+=
"_C1H1"
;
}
auto
AddTypeStr
=
[
&
](
operand_type
type
)
{
switch
(
type
)
{
case
operand_type
::
relu
:
base
+=
"_Relu"
;
break
;
case
operand_type
::
exp
:
base
+=
"_Exp"
;
break
;
case
operand_type
::
sigmoid
:
base
+=
"_Sigmoid"
;
break
;
case
operand_type
::
tanh
:
base
+=
"_Tanh"
;
break
;
case
operand_type
::
identity
:
base
+=
"_Identity"
;
break
;
default:
break
;
}
};
AddTypeStr
(
act_gate_
);
AddTypeStr
(
act_cand_
);
AddTypeStr
(
act_cell_
);
return
base
.
c_str
();
}
explicit
LSTMJitCode
(
bool
compute_c1h1
,
const
lstm_attr_t
&
attr
,
size_t
code_size
=
256
*
1024
,
void
*
code_ptr
=
nullptr
)
:
VActJitCode
(
attr
.
d
,
operand_type
::
sigmoid
/* this is bugy*/
,
code_size
,
code_ptr
),
compute_c1h1_
(
compute_c1h1
)
{
auto
typeExchange
=
[](
const
std
::
string
&
type
)
->
gen
::
operand_type
{
if
(
type
==
"sigmoid"
)
{
return
operand_type
::
sigmoid
;
}
else
if
(
type
==
"relu"
)
{
return
operand_type
::
relu
;
}
else
if
(
type
==
"tanh"
)
{
return
operand_type
::
tanh
;
}
else
if
(
type
==
"identity"
||
type
==
""
)
{
return
operand_type
::
identity
;
}
// else throw error
return
operand_type
::
identity
;
};
num_
=
attr
.
d
;
use_peephole_
=
attr
.
use_peephole
;
act_gate_
=
typeExchange
(
attr
.
act_gate
);
act_cand_
=
typeExchange
(
attr
.
act_cand
);
act_cell_
=
typeExchange
(
attr
.
act_cell
);
}
static
bool
init
(
int
d
);
void
generate
()
override
;
protected:
int
num_
;
bool
compute_c1h1_
;
bool
use_peephole_
;
operand_type
act_gate_
;
operand_type
act_cand_
;
operand_type
act_cell_
;
reg64_t
param1
{
abi_param1
};
};
class
GRUJitCode
:
public
VActJitCode
{
public:
const
char
*
name
()
const
override
{
std
::
string
base
=
"GRUJitCode"
;
if
(
id_
==
0
)
{
base
+=
"_H1"
;
}
else
if
(
id_
==
1
)
{
base
+=
"_HtPart1"
;
}
else
if
(
id_
==
2
)
{
base
+=
"_HtPart2"
;
}
auto
AddTypeStr
=
[
&
](
operand_type
type
)
{
switch
(
type
)
{
case
operand_type
::
relu
:
base
+=
"_Relu"
;
break
;
case
operand_type
::
exp
:
base
+=
"_Exp"
;
break
;
case
operand_type
::
sigmoid
:
base
+=
"_Sigmoid"
;
break
;
case
operand_type
::
tanh
:
base
+=
"_Tanh"
;
break
;
case
operand_type
::
identity
:
base
+=
"_Identity"
;
break
;
default:
break
;
}
};
AddTypeStr
(
act_gate_
);
AddTypeStr
(
act_cand_
);
return
base
.
c_str
();
}
explicit
GRUJitCode
(
int
id
,
const
gru_attr_t
&
attr
,
size_t
code_size
=
256
*
1024
,
void
*
code_ptr
=
nullptr
)
:
VActJitCode
(
attr
.
d
,
operand_type
::
sigmoid
/* this is bugy*/
,
code_size
,
code_ptr
),
id_
(
id
)
{
auto
typeExchange
=
[](
const
std
::
string
&
type
)
->
gen
::
operand_type
{
if
(
type
==
"sigmoid"
)
{
return
operand_type
::
sigmoid
;
}
else
if
(
type
==
"relu"
)
{
return
operand_type
::
relu
;
}
else
if
(
type
==
"tanh"
)
{
return
operand_type
::
tanh
;
}
else
if
(
type
==
"identity"
||
type
==
""
)
{
return
operand_type
::
identity
;
}
// else throw error
return
operand_type
::
identity
;
};
num_
=
attr
.
d
;
act_gate_
=
typeExchange
(
attr
.
act_gate
);
act_cand_
=
typeExchange
(
attr
.
act_cand
);
}
static
bool
init
(
int
d
);
void
generate
()
override
;
protected:
int
id_
;
int
num_
;
operand_type
act_gate_
;
operand_type
act_cand_
;
reg64_t
param1
{
abi_param1
};
};
#ifdef PADDLE_WITH_MKLDNN
#ifdef PADDLE_WITH_MKLDNN
struct
EltwiseMulnChw16cNC
:
public
Xbyak
::
CodeGenerator
{
struct
EltwiseMulnChw16cNC
:
public
Xbyak
::
CodeGenerator
{
explicit
EltwiseMulnChw16cNC
(
size_t
code_size
=
256
*
1024
)
explicit
EltwiseMulnChw16cNC
(
size_t
code_size
=
256
*
1024
)
...
...
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
7e4bd695
...
@@ -17,6 +17,7 @@ limitations under the License. */
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <memory> // for shared_ptr
#include <memory> // for shared_ptr
#include <string>
#include <string>
#include <unordered_map>
#include <unordered_map>
#include "paddle/fluid/operators/math/jit_kernel_impl.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/macros.h"
...
@@ -26,14 +27,7 @@ namespace operators {
...
@@ -26,14 +27,7 @@ namespace operators {
namespace
math
{
namespace
math
{
namespace
jitkernel
{
namespace
jitkernel
{
// TODO(TJ): move these to some proper place
// TODO(TJ): remove me
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define XMM_FLOAT_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define ZMM_FLOAT_BLOCK 16
typedef
enum
{
kLT8
,
kEQ8
,
kGT8LT16
,
kEQ16
,
kGT16
}
jit_block
;
typedef
enum
{
kLT8
,
kEQ8
,
kGT8LT16
,
kEQ16
,
kGT16
}
jit_block
;
class
Kernel
{
class
Kernel
{
...
@@ -128,24 +122,18 @@ class VTanhKernel : public VActKernel<T> {};
...
@@ -128,24 +122,18 @@ class VTanhKernel : public VActKernel<T> {};
template
<
typename
T
>
template
<
typename
T
>
class
LSTMKernel
:
public
Kernel
{
class
LSTMKernel
:
public
Kernel
{
public:
public:
virtual
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
/* below only used in peephole*/
const
T
*
wp_data
=
nullptr
,
T
*
checked
=
nullptr
)
const
=
0
;
// compute c1 and h1 without c0 or h0
// compute c1 and h1 without c0 or h0
virtual
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
void
(
*
ComputeC1H1
)(
lstm_t
*
,
const
lstm_attr_t
*
);
/* below only used in peephole*/
void
(
*
ComputeCtHt
)(
lstm_t
*
,
const
lstm_attr_t
*
);
const
T
*
wp_data
=
nullptr
)
const
=
0
;
};
};
template
<
typename
T
>
template
<
typename
T
>
class
GRUKernel
:
public
Kernel
{
class
GRUKernel
:
public
Kernel
{
public:
public:
// compute h1 without h0
// compute h1 without h0
v
irtual
void
ComputeH1
(
T
*
gates
,
T
*
ht
)
const
=
0
;
v
oid
(
*
ComputeH1
)(
gru_t
*
,
const
gru_attr_t
*
)
;
v
irtual
void
ComputeHtPart1
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
=
0
;
v
oid
(
*
ComputeHtPart1
)(
gru_t
*
,
const
gru_attr_t
*
)
;
v
irtual
void
ComputeHtPart2
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
=
0
;
v
oid
(
*
ComputeHtPart2
)(
gru_t
*
,
const
gru_attr_t
*
)
;
};
};
template
<
typename
T
>
template
<
typename
T
>
...
...
paddle/fluid/operators/math/jit_kernel_blas.cc
浏览文件 @
7e4bd695
...
@@ -15,6 +15,7 @@ limitations under the License. */
...
@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/operators/math/jit_kernel_refer.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
...
@@ -31,49 +32,6 @@ namespace math {
...
@@ -31,49 +32,6 @@ namespace math {
namespace
jitkernel
{
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
namespace
jit
=
platform
::
jit
;
template
<
typename
T
>
void
VMulRefer
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
z
[
i
]
=
x
[
i
]
*
y
[
i
];
}
}
template
<
typename
T
>
void
VAddRefer
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
z
[
i
]
=
x
[
i
]
+
y
[
i
];
}
}
template
<
typename
T
>
void
VAddReluRefer
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
z
[
i
]
=
x
[
i
]
+
y
[
i
];
z
[
i
]
=
z
[
i
]
>
0
?
z
[
i
]
:
0
;
}
}
template
<
typename
T
>
void
VScalRefer
(
const
T
*
a
,
const
T
*
x
,
T
*
y
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
[
0
]
*
x
[
i
];
}
}
template
<
typename
T
>
void
VAddBiasRefer
(
const
T
*
a
,
const
T
*
x
,
T
*
y
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
[
0
]
+
x
[
i
];
}
}
template
<
typename
T
>
void
VReluRefer
(
const
T
*
x
,
T
*
y
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
>
0
?
x
[
i
]
:
0
;
}
}
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
template
<
typename
T
>
template
<
typename
T
>
void
VMulMKL
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
);
void
VMulMKL
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
);
...
@@ -109,7 +67,7 @@ void VScalMKL<float>(const float* a, const float* x, float* y, int n) {
...
@@ -109,7 +67,7 @@ void VScalMKL<float>(const float* a, const float* x, float* y, int n) {
if
(
x
==
y
)
{
if
(
x
==
y
)
{
platform
::
dynload
::
cblas_sscal
(
n
,
*
a
,
y
,
1
);
platform
::
dynload
::
cblas_sscal
(
n
,
*
a
,
y
,
1
);
}
else
{
}
else
{
VScalRefer
<
float
>
(
a
,
x
,
y
,
n
);
refer
::
VScal
<
float
>
(
a
,
x
,
y
,
n
);
}
}
}
}
...
@@ -118,7 +76,7 @@ void VScalMKL<double>(const double* a, const double* x, double* y, int n) {
...
@@ -118,7 +76,7 @@ void VScalMKL<double>(const double* a, const double* x, double* y, int n) {
if
(
x
==
y
)
{
if
(
x
==
y
)
{
platform
::
dynload
::
cblas_dscal
(
n
,
*
a
,
y
,
1
);
platform
::
dynload
::
cblas_dscal
(
n
,
*
a
,
y
,
1
);
}
else
{
}
else
{
VScalRefer
<
double
>
(
a
,
x
,
y
,
n
);
refer
::
VScal
<
double
>
(
a
,
x
,
y
,
n
);
}
}
}
}
...
@@ -147,7 +105,7 @@ class VMulKernelImpl : public VMulKernel<T> {
...
@@ -147,7 +105,7 @@ class VMulKernelImpl : public VMulKernel<T> {
return
;
return
;
}
}
#endif
#endif
this
->
Compute
=
VMulRefer
<
T
>
;
this
->
Compute
=
refer
::
VMul
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
...
@@ -198,7 +156,7 @@ class VAddKernelImpl : public VAddKernel<T> {
...
@@ -198,7 +156,7 @@ class VAddKernelImpl : public VAddKernel<T> {
return
;
return
;
}
}
#endif
#endif
this
->
Compute
=
VAddRefer
<
T
>
;
this
->
Compute
=
refer
::
VAdd
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
...
@@ -280,7 +238,7 @@ class VAddReluKernelImpl : public VAddReluKernel<T> {
...
@@ -280,7 +238,7 @@ class VAddReluKernelImpl : public VAddReluKernel<T> {
return
;
return
;
}
}
#endif
#endif
this
->
Compute
=
VAddReluRefer
<
T
>
;
this
->
Compute
=
refer
::
VAddRelu
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
...
@@ -318,7 +276,7 @@ class VScalKernelImpl : public VScalKernel<T> {
...
@@ -318,7 +276,7 @@ class VScalKernelImpl : public VScalKernel<T> {
return
;
return
;
}
}
#endif
#endif
this
->
Compute
=
VScalRefer
<
T
>
;
this
->
Compute
=
refer
::
VScal
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
...
@@ -362,7 +320,7 @@ class VAddBiasKernelImpl : public VAddBiasKernel<T> {
...
@@ -362,7 +320,7 @@ class VAddBiasKernelImpl : public VAddBiasKernel<T> {
}
}
#endif
#endif
this
->
Compute
=
VAddBiasRefer
<
T
>
;
this
->
Compute
=
refer
::
VAddBias
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
...
@@ -396,7 +354,7 @@ class VReluKernelImpl : public VReluKernel<T> {
...
@@ -396,7 +354,7 @@ class VReluKernelImpl : public VReluKernel<T> {
}
}
#endif
#endif
this
->
Compute
=
VReluRefer
<
T
>
;
this
->
Compute
=
refer
::
VRelu
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
...
@@ -412,16 +370,13 @@ bool VReluKernelImpl<float>::useJIT(int d) {
...
@@ -412,16 +370,13 @@ bool VReluKernelImpl<float>::useJIT(int d) {
}
}
#endif
#endif
template
<
typename
T
>
inline
void
VIdentityRefer
(
const
T
*
x
,
T
*
y
,
int
n
)
{}
/* An empty JitKernel */
/* An empty JitKernel */
template
<
typename
T
>
template
<
typename
T
>
class
VIdentityKernelImpl
:
public
VIdentityKernel
<
T
>
{
class
VIdentityKernelImpl
:
public
VIdentityKernel
<
T
>
{
public:
public:
JITKERNEL_DECLARE_STATIC_FUNC
;
JITKERNEL_DECLARE_STATIC_FUNC
;
explicit
VIdentityKernelImpl
(
int
d
)
:
VIdentityKernel
<
T
>
()
{
explicit
VIdentityKernelImpl
(
int
d
)
:
VIdentityKernel
<
T
>
()
{
this
->
Compute
=
VIdentityRefer
<
T
>
;
this
->
Compute
=
refer
::
VIdentity
<
T
>
;
}
}
};
};
...
...
paddle/fluid/operators/math/jit_kernel_exp.cc
浏览文件 @
7e4bd695
...
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
...
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <cmath> // for exp
#include <string>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/operators/math/jit_kernel_refer.h"
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
#include "paddle/fluid/operators/math/jit_code.h"
#include "paddle/fluid/operators/math/jit_code.h"
...
@@ -25,48 +25,12 @@ limitations under the License. */
...
@@ -25,48 +25,12 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/mklml.h"
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
namespace
jitkernel
{
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
namespace
jit
=
platform
::
jit
;
// TODO(TJ): move refer codes to one file
// Refer code only focus on correctness
template
<
typename
T
>
void
VExpRefer
(
const
T
*
x
,
T
*
y
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
template
<
typename
T
>
void
VSigmoidRefer
(
const
T
*
x
,
T
*
y
,
int
n
)
{
// y = 1 / (1 + e^-x)
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
T
tmp
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
std
::
exp
(
-
tmp
));
}
}
template
<
typename
T
>
void
VTanhRefer
(
const
T
*
x
,
T
*
y
,
int
n
)
{
// y = 2 * sigmoid(2x) - 1
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
2
)
*
x
[
i
];
}
VSigmoidRefer
(
y
,
y
,
n
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
2
)
*
y
[
i
]
-
static_cast
<
T
>
(
1
);
}
}
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
// try to use MKL to speedup
// try to use MKL to speedup
template
<
typename
T
>
template
<
typename
T
>
...
@@ -129,7 +93,7 @@ class VExpKernelImpl : public VExpKernel<T> {
...
@@ -129,7 +93,7 @@ class VExpKernelImpl : public VExpKernel<T> {
return
;
return
;
}
}
#endif
#endif
this
->
Compute
=
VExpRefer
<
T
>
;
this
->
Compute
=
refer
::
VExp
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
...
@@ -182,7 +146,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
...
@@ -182,7 +146,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
return
;
return
;
}
}
#endif
#endif
this
->
Compute
=
VSigmoidRefer
<
T
>
;
this
->
Compute
=
refer
::
VSigmoid
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
...
@@ -234,7 +198,7 @@ class VTanhKernelImpl : public VTanhKernel<T> {
...
@@ -234,7 +198,7 @@ class VTanhKernelImpl : public VTanhKernel<T> {
return
;
return
;
}
}
#endif
#endif
this
->
Compute
=
VTanhRefer
<
T
>
;
this
->
Compute
=
refer
::
VTanh
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
...
@@ -267,154 +231,6 @@ REGISTER_JITKERNEL(vexp, VExpKernel);
...
@@ -267,154 +231,6 @@ REGISTER_JITKERNEL(vexp, VExpKernel);
REGISTER_JITKERNEL
(
vsigmoid
,
VSigmoidKernel
);
REGISTER_JITKERNEL
(
vsigmoid
,
VSigmoidKernel
);
REGISTER_JITKERNEL
(
vtanh
,
VTanhKernel
);
REGISTER_JITKERNEL
(
vtanh
,
VTanhKernel
);
namespace
detail
{
#ifdef __AVX__
// 32-byte alignment attribute so the tables below can be read as a whole
// 256-bit vector.
#define ALIGN32 __attribute__((aligned(32)))

// Defines `_ps256_<Name>`: eight copies of the float value Val, 32-byte
// aligned.
#define _PS256_CONST(Name, Val)                                      \
  static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
                                                 Val, Val, Val, Val}

// Defines `_pi256_<Name>`: eight copies of the int value Val, 32-byte aligned.
#define _PI256_CONST(Name, Val)                                    \
  static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
                                               Val, Val, Val, Val}

// Added to the integer exponent before it is shifted into place (see the
// exp routines that consume this table).
_PI256_CONST(0x7f, 0x7f);
_PS256_CONST(one, 1.f);
_PS256_CONST(0p5, 0.5f);

// Clamp range applied to the exp input.
_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);

// Constants of the cephes-style exp approximation: log2(e), the two-part
// split used for range reduction (C1, C2), and polynomial coefficients p0..p5.
_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4);

_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
// Overlays one 256-bit integer vector with its two 128-bit halves so that
// values can be moved between the two representations.
union imm_xmm_union {
  __m256i imm;     // the full 256-bit view
  __m128i xmm[2];  // the same storage viewed as two 128-bit lanes
};
// Splits imm_ into two halves: writes imm_ into an aligned imm_xmm_union and
// reads the two xmm members back into xmm0_ (index 0) and xmm1_ (index 1).
#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
{ \
imm_xmm_union u ALIGN32; \
u.imm = imm_; \
xmm0_ = u.xmm[0]; \
xmm1_ = u.xmm[1]; \
}
// Inverse of COPY_IMM_TO_XMM: stores xmm0_ / xmm1_ into the two xmm members
// of an aligned imm_xmm_union and reads the combined imm member into imm_.
#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
{ \
imm_xmm_union u ALIGN32; \
u.xmm[0] = xmm0_; \
u.xmm[1] = xmm1_; \
imm_ = u.imm; \
}
// Generates `avx2_mm256_<fn>(__m256i x, int y)`, a stand-in for the AVX2
// intrinsic `_mm256_<fn>` built from the 128-bit `_mm_<fn>`: the input is
// split into two halves, `_mm_<fn>(half, y)` is applied to each, and the
// halves are recombined.
// Every line of the body, including the comment, carries a continuation
// backslash so that the whole function stays inside the macro.
#define AVX2_BITOP_USING_SSE2(fn)                           \
  static inline __m256i avx2_mm256_##fn(__m256i x, int y) { \
    /* use SSE2 to perform the bitop AVX2 */                \
    __m128i x1, x2;                                         \
    __m256i ret;                                            \
    COPY_IMM_TO_XMM(x, x1, x2);                             \
    x1 = _mm_##fn(x1, y);                                   \
    x2 = _mm_##fn(x2, y);                                   \
    COPY_XMM_TO_IMM(x1, x2, ret);                           \
    return ret;                                             \
  }
// Generates `avx2_mm256_<fn>(__m256i x, __m256i y)`, a stand-in for the AVX2
// intrinsic `_mm256_<fn>` built from the 128-bit `_mm_<fn>`: both operands
// are split into two halves, `_mm_<fn>` is applied pairwise, and the halves
// are recombined.
// The generated name is derived from `fn` (it used to be fixed to
// avx2_mm256_add_epi32, so a second instantiation would have redefined the
// same function). Instantiating with add_epi32 yields the same name as before.
#define AVX2_INTOP_USING_SSE2(fn)                               \
  static inline __m256i avx2_mm256_##fn(__m256i x, __m256i y) { \
    /* use SSE2 to perform the AVX2 integer operation */        \
    __m128i x1, x2;                                             \
    __m128i y1, y2;                                             \
    __m256i ret;                                                \
    COPY_IMM_TO_XMM(x, x1, x2);                                 \
    COPY_IMM_TO_XMM(y, y1, y2);                                 \
    x1 = _mm_##fn(x1, y1);                                      \
    x2 = _mm_##fn(x2, y2);                                      \
    COPY_XMM_TO_IMM(x1, x2, ret);                               \
    return ret;                                                 \
  }
// Instantiate the SSE2-backed helpers: avx2_mm256_slli_epi32 and
// avx2_mm256_add_epi32.
AVX2_BITOP_USING_SSE2(slli_epi32);
AVX2_INTOP_USING_SSE2(add_epi32);
// Shared body of the vectorized exp routines. Expects a mutable `__m256 x` in
// scope and leaves behind:
//   y    - the polynomial approximation of exp(g) plus the leading 1 + g terms
//   imm0 - fx truncated to 32-bit integers (the `n` of exp(g + n*log(2)))
// The caller turns imm0 into 2^n and multiplies it into y.
// The macro intentionally ends without a semicolon; invoke it as
// `AVXEXP_BASE;`. Every line, comments included, carries a continuation
// backslash so the whole sequence stays inside the macro.
#define AVXEXP_BASE \
  __m256 tmp = _mm256_setzero_ps(), fx; \
  __m256 one = *reinterpret_cast<const __m256*>(_ps256_one); \
  __m256i imm0; \
  /* clamp the input to [exp_lo, exp_hi] */ \
  x = _mm256_min_ps(x, *reinterpret_cast<const __m256*>(_ps256_exp_hi)); \
  x = _mm256_max_ps(x, *reinterpret_cast<const __m256*>(_ps256_exp_lo)); \
  /* express exp(x) as exp(g + n*log(2)) */ \
  fx = _mm256_mul_ps(x, \
                     *reinterpret_cast<const __m256*>(_ps256_cephes_LOG2EF)); \
  fx = _mm256_add_ps(fx, *reinterpret_cast<const __m256*>(_ps256_0p5)); \
  tmp = _mm256_floor_ps(fx); \
  /* if greater, subtract 1 */ \
  __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); \
  mask = _mm256_and_ps(mask, one); \
  fx = _mm256_sub_ps(tmp, mask); \
  /* x <- x - fx*C1 - fx*C2 (range reduction) */ \
  tmp = _mm256_mul_ps(fx, \
                      *reinterpret_cast<const __m256*>(_ps256_cephes_exp_C1)); \
  __m256 z = _mm256_mul_ps( \
      fx, *reinterpret_cast<const __m256*>(_ps256_cephes_exp_C2)); \
  x = _mm256_sub_ps(x, tmp); \
  x = _mm256_sub_ps(x, z); \
  z = _mm256_mul_ps(x, x); \
  /* Horner evaluation of the polynomial p0..p5 in x */ \
  __m256 y = *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p0); \
  y = _mm256_mul_ps(y, x); \
  y = _mm256_add_ps(y, \
                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p1)); \
  y = _mm256_mul_ps(y, x); \
  y = _mm256_add_ps(y, \
                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p2)); \
  y = _mm256_mul_ps(y, x); \
  y = _mm256_add_ps(y, \
                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p3)); \
  y = _mm256_mul_ps(y, x); \
  y = _mm256_add_ps(y, \
                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p4)); \
  y = _mm256_mul_ps(y, x); \
  y = _mm256_add_ps(y, \
                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p5)); \
  /* y <- y * x^2 + x + 1 */ \
  y = _mm256_mul_ps(y, z); \
  y = _mm256_add_ps(y, x); \
  y = _mm256_add_ps(y, one); \
  /* build 2^n */ \
  imm0 = _mm256_cvttps_epi32(fx)
// exp() over eight packed floats. The two integer steps use the SSE2-backed
// avx2_mm256_* helpers in place of AVX2 instructions.
__m256 ExpAVX(__m256 x) {
  AVXEXP_BASE;  // defines y (polynomial part) and imm0 (integer n)
  // two AVX2 instructions using SSE2: add 0x7f to n, then shift left by 23
  const __m256i bias = *reinterpret_cast<const __m256i*>(_pi256_0x7f);
  imm0 = avx2_mm256_slli_epi32(avx2_mm256_add_epi32(imm0, bias), 23);
  // Reinterpret the integer lanes as floats and scale the polynomial result.
  return _mm256_mul_ps(y, _mm256_castsi256_ps(imm0));
}
#endif
#ifdef __AVX2__
// exp() over eight packed floats, using the AVX2 integer intrinsics directly.
__m256 ExpAVX2(__m256 x) {
  AVXEXP_BASE;  // defines y (polynomial part) and imm0 (integer n)
  // two AVX2 instructions: add 0x7f to n, then shift left by 23
  const __m256i bias = *reinterpret_cast<const __m256i*>(_pi256_0x7f);
  imm0 = _mm256_slli_epi32(_mm256_add_epi32(imm0, bias), 23);
  // Reinterpret the integer lanes as floats and scale the polynomial result.
  return _mm256_mul_ps(y, _mm256_castsi256_ps(imm0));
}
#endif
}
// namespace detail
}
// namespace jitkernel
}
// namespace jitkernel
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/math/jit_kernel_impl.h
0 → 100644
浏览文件 @
7e4bd695
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <type_traits>
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
// Lower / upper bounds that the sigmoid kernels clamp their input to before
// calling exp.
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
// Upper bound applied to the exp argument in the vectorized tanh path.
#define EXP_MAX_INPUT 40.0
// Block sizes named after the XMM / YMM / ZMM registers; presumably the number
// of floats per 128 / 256 / 512-bit register -- TODO confirm.
#define XMM_FLOAT_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define ZMM_FLOAT_BLOCK 16
// Untyped buffers for one LSTM step; the kernels cast them to the element
// type they operate on.
// Declared as a named struct rather than `typedef struct { ... } lstm_t;`
// because an unnamed struct that gets its name from a typedef may not carry
// default member initializers. The type name `lstm_t` is unchanged.
struct lstm_t {
  void* gates;  // gates: W_ch, W_ih, W_fh, W_oh
  const void* ct_1;
  void* ct;
  void* ht;
  /* weight_peephole and checked data are only used in peephole*/
  const void* wp{nullptr};
  void* checked{nullptr};
};
// Untyped buffers for one GRU step; the kernels cast them to the element
// type they operate on.
struct gru_t {
  void* gates;  // gates: {W_update, W_reset; W_state}
  const void* ht_1;
  void* ht;
};
// Attributes shared by the recurrent kernels.
//   d        - size used by the kernels to offset into and iterate over the
//              gate buffers
//   act_gate - activation name for the gates
//   act_cand - activation name for the candidate
struct rnn_attr_s {
  // Zero-initialized so a default-constructed attribute never exposes an
  // indeterminate size.
  int d{0};
  std::string act_gate, act_cand;
  rnn_attr_s() = default;
  rnn_attr_s(int _d, const std::string& _act_gate,
             const std::string& _act_cand)
      : d(_d), act_gate(_act_gate), act_cand(_act_cand) {}
};
// LSTM attributes: the shared recurrent attributes plus the cell activation
// name and the peephole switch.
struct lstm_attr_s : public rnn_attr_s {
  // Initialized to false so a default-constructed attribute never exposes an
  // indeterminate flag; this matches the default of the constructor below.
  bool use_peephole{false};
  std::string act_cell;
  lstm_attr_s() = default;
  lstm_attr_s(int _d, const std::string& _act_gate,
              const std::string& _act_cand, const std::string& _act_cell,
              bool _use_peephole = false)
      : rnn_attr_s(_d, _act_gate, _act_cand),
        use_peephole(_use_peephole),
        act_cell(_act_cell) {}
};
typedef
struct
rnn_attr_s
gru_attr_t
;
typedef
struct
lstm_attr_s
lstm_attr_t
;
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_macro.h
浏览文件 @
7e4bd695
...
@@ -82,10 +82,10 @@ namespace jitkernel {
...
@@ -82,10 +82,10 @@ namespace jitkernel {
#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name, \
#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name, \
marco_declare, macro_find_key, macro_impl) \
marco_declare, macro_find_key, macro_impl) \
marco_define_name(ker_key, ker_class); \
marco_define_name(ker_key, ker_class); \
REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float,
JITKERNEL_DECLARE,
\
REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float,
marco_declare,
\
JITKERNEL_FIND_KEY, JITKERNEL_IMPL);
\
macro_find_key, macro_impl);
\
REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double,
JITKERNEL_DECLARE,
\
REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double,
marco_declare,
\
JITKERNEL_FIND_KEY, JITKERNEL_IMPL
)
macro_find_key, macro_impl
)
#define REGISTER_JITKERNEL(ker_key, ker_class) \
#define REGISTER_JITKERNEL(ker_key, ker_class) \
REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \
REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \
...
...
paddle/fluid/operators/math/jit_kernel_refer.h
0 → 100644
浏览文件 @
7e4bd695
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_impl.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
namespace
refer
{
/* Reference code focuses only on correctness. */
// Element-wise product: z[i] = x[i] * y[i] for i in [0, n).
template <typename T>
void VMul(const T* x, const T* y, T* z, int n) {
  int idx = 0;
  while (idx < n) {
    z[idx] = x[idx] * y[idx];
    ++idx;
  }
}
// Element-wise sum: z[i] = x[i] + y[i] for i in [0, n).
template <typename T>
void VAdd(const T* x, const T* y, T* z, int n) {
  int idx = 0;
  while (idx < n) {
    z[idx] = x[idx] + y[idx];
    ++idx;
  }
}
// Element-wise sum followed by ReLU: z[i] = max(x[i] + y[i], 0).
template <typename T>
void VAddRelu(const T* x, const T* y, T* z, int n) {
  for (int k = 0; k < n; ++k) {
    T& out = z[k];
    out = x[k] + y[k];
    // Anything that is not strictly positive is replaced by zero.
    if (!(out > 0)) {
      out = 0;
    }
  }
}
// Scales a vector by a scalar: y[i] = a[0] * x[i] for i in [0, n).
template <typename T>
void VScal(const T* a, const T* x, T* y, int n) {
  int idx = 0;
  while (idx < n) {
    // The scalar is read through the pointer on every iteration.
    y[idx] = (*a) * x[idx];
    ++idx;
  }
}
// Adds a scalar to every element: y[i] = a[0] + x[i] for i in [0, n).
template <typename T>
void VAddBias(const T* a, const T* x, T* y, int n) {
  int idx = 0;
  while (idx < n) {
    // The scalar is read through the pointer on every iteration.
    y[idx] = (*a) + x[idx];
    ++idx;
  }
}
// ReLU: y[i] = x[i] when x[i] > 0, otherwise 0.
template <typename T>
void VRelu(const T* x, T* y, int n) {
  for (int k = 0; k < n; ++k) {
    if (x[k] > 0) {
      y[k] = x[k];
    } else {
      y[k] = 0;
    }
  }
}
// Identity activation: y[i] = x[i] for i in [0, n).
// The body used to be empty, which is only correct when the caller passes the
// same buffer as input and output. Callers in this file also invoke
// activations out of place (e.g. act_cell(ct, gates + d2, d)), so the values
// must actually be copied.
template <typename T>
inline void VIdentity(const T* x, T* y, int n) {
  // In-place call: nothing to do.
  if (x == y) {
    return;
  }
  for (int i = 0; i < n; ++i) {
    y[i] = x[i];
  }
}
// Element-wise exponential: y[i] = std::exp(x[i]) for i in [0, n).
template <typename T>
void VExp(const T* x, T* y, int n) {
  int idx = 0;
  while (idx < n) {
    y[idx] = std::exp(x[idx]);
    ++idx;
  }
}
// Sigmoid: y = 1 / (1 + e^-x), with the input first clamped to
// [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX].
template <typename T>
void VSigmoid(const T* x, T* y, int n) {
  const T lo = SIGMOID_THRESHOLD_MIN;
  const T hi = SIGMOID_THRESHOLD_MAX;
  const T one = static_cast<T>(1);
  for (int k = 0; k < n; ++k) {
    T clamped = x[k];
    if (clamped < lo) {
      clamped = lo;
    } else if (clamped > hi) {
      clamped = hi;
    }
    y[k] = one / (one + std::exp(-clamped));
  }
}
// tanh evaluated as y = 2 * sigmoid(2x) - 1.
//   x: input array of n elements
//   y: output array of n elements (also holds the intermediate 2x values)
//   n: element count
template <typename T>
void VTanh(const T* x, T* y, int n) {
  const T two = static_cast<T>(2);
  const T one = static_cast<T>(1);
  // Pass 1: y <- 2 * x
  int k = 0;
  while (k < n) {
    y[k] = two * x[k];
    ++k;
  }
  // Pass 2: y <- sigmoid(y), computed in place.
  VSigmoid(y, y, n);
  // Pass 3: y <- 2 * y - 1
  for (k = 0; k < n; ++k) {
    y[k] = two * y[k] - one;
  }
}
// Maps an activation name to the matching reference kernel.
//   "sigmoid" -> VSigmoid, "relu" -> VRelu, "tanh" -> VTanh,
//   "identity" or "" -> VIdentity.
// Any other name is reported through PADDLE_THROW.
template <typename T>
auto getActFunc(const std::string& type) -> void (*)(const T*, T*, int) {
  if (type == "sigmoid") {
    return VSigmoid<T>;
  }
  if (type == "relu") {
    return VRelu<T>;
  }
  if (type == "tanh") {
    return VTanh<T>;
  }
  if (type == "identity" || type == "") {
    return VIdentity<T>;
  }
  PADDLE_THROW("Not support type: %s", type);
  return nullptr;
}
// Computes ct and ht for one LSTM step from the previous cell state.
// The gate buffer holds four consecutive segments of attr->d elements each,
// in the order W_ch, W_ih, W_fh, W_oh (candidate, input, forget, output).
// The buffer is overwritten while the step is computed.
template <typename T>
void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
  // Typed views over the untyped step buffers.
  T* const cand = static_cast<T*>(step->gates);
  const T* const c_prev = static_cast<const T*>(step->ct_1);
  T* const c_out = static_cast<T*>(step->ct);
  T* const h_out = static_cast<T*>(step->ht);
  const T* const peep_w = static_cast<const T*>(step->wp);
  T* const scratch = static_cast<T*>(step->checked);

  const auto gate_fn = getActFunc<T>(attr->act_gate);
  const auto cand_fn = getActFunc<T>(attr->act_cand);
  const auto cell_fn = getActFunc<T>(attr->act_cell);

  const int d = attr->d;
  const bool peephole = attr->use_peephole;
  T* const in_gate = cand + d;
  T* const forget_gate = cand + 2 * d;
  T* const out_gate = cand + 3 * d;

  if (peephole) {
    // Add the peephole terms wp * ct_1 to the input and forget gates, then
    // activate those two gates; the output gate is handled further down.
    VMul(peep_w, c_prev, scratch, d);
    VMul(peep_w + d, c_prev, scratch + d, d);
    VAdd(scratch, in_gate, in_gate, 2 * d);
    gate_fn(in_gate, in_gate, 2 * d);
  } else {
    // Activate the input, forget and output gates in one call.
    gate_fn(in_gate, in_gate, 3 * d);
  }

  // C_t = C_t-1 * fgated + cand_gated * igated
  cand_fn(cand, cand, d);
  VMul(cand, in_gate, in_gate, d);
  VMul(c_prev, forget_gate, forget_gate, d);
  VAdd(in_gate, forget_gate, c_out, d);

  if (peephole) {
    // get ogated: the input-gate segment is reused as scratch for wp * C_t.
    VMul(peep_w + 2 * d, c_out, in_gate, d);
    VAdd(in_gate, out_gate, out_gate, d);
    gate_fn(out_gate, out_gate, d);
  }

  // H_t = act_cell(C_t) * ogated
  cell_fn(c_out, forget_gate, d);
  VMul(forget_gate, out_gate, h_out, d);
}
// Computes c1 and h1 for the first LSTM step, where there is no c0 or h0.
// The gate buffer layout matches LSTMCtHt: four segments of attr->d elements
// (candidate, input, forget, output). The buffer is overwritten.
template <typename T>
void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
  // Typed views over the untyped step buffers.
  T* const cand = static_cast<T*>(step->gates);
  T* const c_out = static_cast<T*>(step->ct);
  T* const h_out = static_cast<T*>(step->ht);

  const auto gate_fn = getActFunc<T>(attr->act_gate);
  const auto cand_fn = getActFunc<T>(attr->act_cand);
  const auto cell_fn = getActFunc<T>(attr->act_cell);

  const int d = attr->d;
  T* const in_gate = cand + d;
  T* const scratch = cand + 2 * d;  // receives act_cell(C_t)
  T* const out_gate = cand + 3 * d;

  /* C_t = igated * cgated*/
  gate_fn(in_gate, in_gate, d);
  cand_fn(cand, cand, d);
  VMul(cand, in_gate, c_out, d);

  if (attr->use_peephole) {
    // get outgated, put W_oc * C_t on igated
    const T* const peep_w = static_cast<const T*>(step->wp);
    VMul(peep_w + 2 * d, c_out, in_gate, d);
    VAdd(in_gate, out_gate, out_gate, d);
  }

  /* H_t = act_cell(C_t) * ogated */
  gate_fn(out_gate, out_gate, d);
  cell_fn(c_out, scratch, d);
  VMul(scratch, out_gate, h_out, d);
}
// Computes h1 for the first GRU step, where there is no h0:
//   ht = act_gate(gates[0, d)) * act_cand(gates[2d, 3d))
// Both gate segments are activated in place.
template <typename T>
void GRUH1(gru_t* step, const gru_attr_t* attr) {
  T* const update = static_cast<T*>(step->gates);
  T* const h_out = static_cast<T*>(step->ht);

  const auto gate_fn = getActFunc<T>(attr->act_gate);
  const auto cand_fn = getActFunc<T>(attr->act_cand);

  const int d = attr->d;
  T* const state = update + 2 * d;

  gate_fn(update, update, d);
  cand_fn(state, state, d);
  VMul(update, state, h_out, d);
}
// compute the first part of GRU: ht = act_gate(r) * ht_1
// W: {W_update, W_reset; W_state}; the reset segment starts attr->d elements
// into the gate buffer and is activated in place.
template <typename T>
void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
  T* const gates = static_cast<T*>(step->gates);
  T* const h_out = static_cast<T*>(step->ht);
  const T* const h_prev = static_cast<const T*>(step->ht_1);

  const auto gate_fn = getActFunc<T>(attr->act_gate);

  T* const reset = gates + attr->d;
  gate_fn(reset, reset, attr->d);
  VMul(h_prev, reset, h_out, attr->d);
}
// compute the second part of GRU:
// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1
// The update segment gates[0, d) and the state segment gates[2d, 3d) are
// activated in place before being combined.
template <typename T>
void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
  T* const update = static_cast<T*>(step->gates);
  T* const h_out = static_cast<T*>(step->ht);
  const T* const h_prev = static_cast<const T*>(step->ht_1);

  const auto gate_fn = getActFunc<T>(attr->act_gate);
  const auto cand_fn = getActFunc<T>(attr->act_cand);

  const int d = attr->d;
  T* const state = update + d * 2;

  gate_fn(update, update, d);
  cand_fn(state, state, d);

  // out = zt*ht~ + (1-zt)*ht_1
  const T one = static_cast<T>(1);
  int k = 0;
  while (k < d) {
    h_out[k] = update[k] * state[k] + (one - update[k]) * h_prev[k];
    ++k;
  }
}
}
// namespace refer
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_rnn.cc
浏览文件 @
7e4bd695
...
@@ -15,470 +15,248 @@ limitations under the License. */
...
@@ -15,470 +15,248 @@ limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/operators/math/jit_kernel_refer.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/macros.h"
#ifdef
__AVX__
#ifdef
PADDLE_WITH_XBYAK
#include
<immintrin.h>
#include
"paddle/fluid/operators/math/jit_code.h"
#endif
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
namespace
jitkernel
{
namespace
jitkernel
{
namespace
detail
{
#ifdef __AVX__
__m256
ExpAVX
(
__m256
x
);
#endif
#ifdef __AVX2__
__m256
ExpAVX2
(
__m256
x
);
#endif
}
// namespace detail
namespace
jit
=
platform
::
jit
;
#ifdef __AVX__
typedef
enum
{
kSigmoid
,
kRelu
,
kTanh
,
kIdentity
}
act_type
;
class
AVXAct
{
/* LSTM JitKernel */
public:
template
<
typename
T
>
virtual
~
AVXAct
()
=
default
;
class
LSTMKernelImpl
:
public
LSTMKernel
<
T
>
{
virtual
__m256
Compute
(
__m256
x
)
const
=
0
;
};
template
<
act_type
type
,
jit
::
cpu_isa_t
isa
>
class
AVXActImpl
:
public
AVXAct
{
public:
public:
__m256
Compute
(
__m256
x
)
const
override
{
PADDLE_THROW
(
"Unkown type!"
);
}
static
inline
std
::
string
name
(
const
lstm_attr_t
&
attr
)
{
};
PADDLE_THROW
(
"DType should be either float or double"
);
#define AVX_SIGMOID(isa, expisa) \
template <> \
__m256 AVXActImpl<kSigmoid, isa>::Compute(__m256 x) const { \
__m256 ones = _mm256_set1_ps(1.0f); \
x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); \
x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); \
x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); \
x = expisa(x); \
x = _mm256_add_ps(ones, x); \
return _mm256_div_ps(ones, x); \
}
#define AVX_TANH(isa, expisa) \
template <> \
__m256 AVXActImpl<kTanh, isa>::Compute(__m256 x) const { \
__m256 ones = _mm256_set1_ps(1.0f); \
x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); \
x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); \
x = expisa(x); \
x = _mm256_add_ps(ones, x); \
x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); \
return _mm256_sub_ps(x, ones); \
}
}
static
inline
bool
useJIT
(
int
d
)
{
return
false
;
}
#define AVX_RELU(isa) \
static
inline
bool
useMKL
(
int
d
)
{
return
false
;
}
template <> \
explicit
LSTMKernelImpl
(
const
lstm_attr_t
&
attr
)
:
LSTMKernel
<
T
>
()
{
__m256 AVXActImpl<kRelu, isa>::Compute(__m256 x) const { \
#ifdef PADDLE_WITH_XBYAK
return _mm256_max_ps(x, _mm256_setzero_ps()); \
if
(
useJIT
(
attr
.
d
))
{
size_t
sz
=
96
+
attr
.
d
/
YMM_FLOAT_BLOCK
*
90
*
4
*
8
;
jitcode0_
.
reset
(
new
gen
::
LSTMJitCode
(
false
,
attr
,
sz
>
4096
?
sz
:
4096
));
this
->
ComputeCtHt
=
jitcode0_
->
getCode
<
void
(
*
)(
lstm_t
*
,
const
lstm_attr_t
*
)
>
();
jitcode1_
.
reset
(
new
gen
::
LSTMJitCode
(
true
,
attr
,
sz
>
4096
?
sz
:
4096
));
this
->
ComputeC1H1
=
jitcode1_
->
getCode
<
void
(
*
)(
lstm_t
*
,
const
lstm_attr_t
*
)
>
();
return
;
}
}
#endif
#define AVX_IDENTITY(isa) \
this
->
ComputeCtHt
=
refer
::
LSTMCtHt
<
T
>
;
template <> \
this
->
ComputeC1H1
=
refer
::
LSTMC1H1
<
T
>
;
__m256 AVXActImpl<kIdentity, isa>::Compute(__m256 x) const { \
return x; \
}
}
#define FOR_EACH_AVX_ISA(macro_) \
#ifdef PADDLE_WITH_XBYAK
macro_(jit::avx); \
macro_(jit::avx2); \
macro_(jit::avx512f)
FOR_EACH_AVX_ISA
(
AVX_RELU
);
FOR_EACH_AVX_ISA
(
AVX_IDENTITY
);
AVX_SIGMOID
(
jit
::
avx
,
detail
::
ExpAVX
);
AVX_TANH
(
jit
::
avx
,
detail
::
ExpAVX
);
#ifdef __AVX2__
private:
AVX_SIGMOID
(
jit
::
avx2
,
detail
::
ExpAVX2
);
std
::
unique_ptr
<
gen
::
LSTMJitCode
>
jitcode0_
{
nullptr
},
jitcode1_
{
nullptr
};
AVX_SIGMOID
(
jit
::
avx512f
,
detail
::
ExpAVX2
);
AVX_TANH
(
jit
::
avx2
,
detail
::
ExpAVX2
);
AVX_TANH
(
jit
::
avx512f
,
detail
::
ExpAVX2
);
#endif
#endif
};
#undef FOR_EACH_AVX_ISA
#ifdef PADDLE_WITH_XBYAK
#undef AVX_IDENTITY
template
<
>
#undef AVX_RELU
bool
LSTMKernelImpl
<
float
>::
useJIT
(
int
d
)
{
#undef AVX_TANH
return
gen
::
LSTMJitCode
::
init
(
d
);
#undef AVX_SIGMOID
}
#endif
#endif
/* Peephole JitKernel */
template
<
typename
T
>
template
<
typename
T
>
static
std
::
shared_ptr
<
const
VActKernel
<
T
>>
GetActKernel
(
class
PeepholeKernelImpl
:
public
LSTMKernel
<
T
>
{
const
std
::
string
&
type
,
int
n
)
{
public:
if
(
type
==
"sigmoid"
)
{
static
inline
std
::
string
name
(
const
lstm_attr_t
&
attr
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
PADDLE_THROW
(
"DType should be either float or double"
);
KernelPool
::
Instance
().
template
Get
<
VSigmoidKernel
<
T
>
>
(
n
));
}
else
if
(
type
==
"relu"
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VReluKernel
<
T
>
>
(
n
));
}
else
if
(
type
==
"tanh"
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VTanhKernel
<
T
>
>
(
n
));
}
else
if
(
type
==
"identity"
||
type
==
""
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VIdentityKernel
<
T
>
>
(
n
));
}
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
static
inline
bool
useJIT
(
int
d
)
{
return
false
;
}
return
nullptr
;
static
inline
bool
useMKL
(
int
d
)
{
return
false
;
}
}
explicit
PeepholeKernelImpl
(
const
lstm_attr_t
&
attr
)
:
LSTMKernel
<
T
>
()
{
#ifdef PADDLE_WITH_XBYAK
#ifdef __AVX__
if
(
useJIT
(
attr
.
d
))
{
template
<
jit
::
cpu_isa_t
isa
>
size_t
sz
=
96
+
attr
.
d
/
YMM_FLOAT_BLOCK
*
96
*
4
*
8
;
static
std
::
unique_ptr
<
AVXAct
>
GetAVXAct
(
const
std
::
string
&
type
)
{
jitcode0_
.
reset
(
new
gen
::
LSTMJitCode
(
false
,
attr
,
sz
>
4096
?
sz
:
4096
));
if
(
type
==
"sigmoid"
)
{
this
->
ComputeCtHt
=
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kSigmoid
,
isa
>
());
jitcode0_
->
getCode
<
void
(
*
)(
lstm_t
*
,
const
lstm_attr_t
*
)
>
();
}
else
if
(
type
==
"relu"
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kRelu
,
isa
>
());
jitcode1_
.
reset
(
new
gen
::
LSTMJitCode
(
true
,
attr
,
sz
>
4096
?
sz
:
4096
));
}
else
if
(
type
==
"tanh"
)
{
this
->
ComputeC1H1
=
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kTanh
,
isa
>
());
jitcode1_
->
getCode
<
void
(
*
)(
lstm_t
*
,
const
lstm_attr_t
*
)
>
();
}
else
if
(
type
==
"identity"
||
type
==
""
)
{
return
;
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kIdentity
,
isa
>
());
}
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
return
nullptr
;
}
#endif
#endif
/* LSTM JitKernel */
this
->
ComputeCtHt
=
refer
::
LSTMCtHt
<
T
>
;
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
this
->
ComputeC1H1
=
refer
::
LSTMC1H1
<
T
>
;
class
LSTMKernelImpl
:
public
LSTMKernel
<
T
>
{
public:
explicit
LSTMKernelImpl
(
const
std
::
string
&
act_gate
,
const
std
::
string
&
act_cand
,
const
std
::
string
&
act_cell
,
int
d
)
:
LSTMKernel
<
T
>
()
{
d_
=
d
;
d2_
=
d
*
2
;
d3_
=
d
*
3
;
act_gate_d3_
=
GetActKernel
<
T
>
(
act_gate
,
d3_
);
act_gate_d_
=
GetActKernel
<
T
>
(
act_gate
,
d
);
act_cand_d_
=
GetActKernel
<
T
>
(
act_cand
,
d
);
act_cell_d_
=
GetActKernel
<
T
>
(
act_cell
,
d
);
vmul_d_
=
KernelPool
::
Instance
().
template
Get
<
VMulKernel
<
T
>
>
(
d
);
vadd_d_
=
KernelPool
::
Instance
().
template
Get
<
VAddKernel
<
T
>
>
(
d
);
}
}
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
,
#ifdef PADDLE_WITH_XBYAK
T
*
checked
)
const
override
{
// gates: W_ch, W_ih, W_fh, W_oh
act_gate_d3_
->
Compute
(
gates
+
d_
,
gates
+
d_
,
d3_
);
/* C_t = C_t-1 * fgated + cand_gated * igated */
act_cand_d_
->
Compute
(
gates
,
gates
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
,
d_
);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
,
d_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
/* C_t = igated * cgated*/
act_gate_d_
->
Compute
(
gates
+
d_
,
gates
+
d_
,
d_
);
act_cand_d_
->
Compute
(
gates
,
gates
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
,
d_
);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
,
d_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
,
d_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
private:
private:
int
d_
,
d2_
,
d3_
;
std
::
unique_ptr
<
gen
::
LSTMJitCode
>
jitcode0_
{
nullptr
},
jitcode1_
{
nullptr
};
std
::
shared_ptr
<
const
VActKernel
<
T
>>
act_gate_d3_
,
act_gate_d_
,
act_cand_d_
,
act_cell_d_
;
std
::
shared_ptr
<
const
VMulKernel
<
T
>>
vmul_d_
;
std
::
shared_ptr
<
const
VAddKernel
<
T
>>
vadd_d_
;
#ifdef __AVX__
std
::
unique_ptr
<
const
AVXAct
>
avx_act_gate_
,
avx_act_cand_
,
avx_act_cell_
;
#endif
#endif
};
};
#define INTRI8_FLOAT(isa) \
#ifdef PADDLE_WITH_XBYAK
template
<
>
bool
PeepholeKernelImpl
<
float
>::
useJIT
(
int
d
)
{
return
gen
::
LSTMJitCode
::
init
(
d
);
}
#endif
#define JITKERNEL_DEFINE_NAME_LSTM(ker_key, ker_class) \
template <> \
template <> \
LSTMKernelImpl<float, isa, kEQ8>::LSTMKernelImpl( \
std::string ker_class##Impl<float>::name(const lstm_attr_t& attr) { \
const std::string& act_gate, const std::string& act_cand, \
std::string key(#ker_key "f"); \
const std::string& act_cell, int d) \
key += (attr.act_gate + attr.act_cand + attr.act_cell + \
: LSTMKernel<float>() { \
(attr.use_peephole ? "p" : "n")); \
avx_act_gate_ = GetAVXAct<isa>(act_gate); \
if (useJIT(attr.d)) { \
avx_act_cand_ = GetAVXAct<isa>(act_cand); \
/* only jit code need record d*/
\
avx_act_cell_ = GetAVXAct<isa>(act_cell); \
return key + "jit" + std::to_string(attr.d); \
} else if (useMKL(attr.d)) { \
return key + "mkl"; \
} else { \
return key + "any"; \
} \
} \
template <> \
void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt( \
float* gates, const float* ct_1, float* ct, float* ht, \
const float* wp_data, float* checked) const { \
/* gates: W_ch, W_ih, W_fh, W_oh */
\
__m256 c, i, f, o; \
c = _mm256_loadu_ps(gates); \
i = _mm256_loadu_ps(gates + 8); \
f = _mm256_loadu_ps(gates + 16); \
o = _mm256_loadu_ps(gates + 24); \
/* C_t = C_t-1 * fgated + cand_gated * igated*/
\
c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \
i = _mm256_loadu_ps(ct_1); \
f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \
f = _mm256_add_ps(c, f); \
_mm256_storeu_ps(ct, f); \
/* H_t = act_cell(C_t) * ogated */
\
o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \
_mm256_storeu_ps(ht, o); \
} \
} \
template <> \
template <> \
void LSTMKernelImpl<float, isa, kEQ8>::ComputeC1H1( \
std::string ker_class##Impl<double>::name(const lstm_attr_t& attr) { \
float* gates, float* ct, float* ht, const float* wp_data) const { \
std::string key(#ker_key "d"); \
__m256 c, i, o; \
/* jit code do not support double yet*/
\
c = _mm256_loadu_ps(gates); \
if (useMKL(attr.d)) { \
i = _mm256_loadu_ps(gates + 8); \
return key + "mkl"; \
o = _mm256_loadu_ps(gates + 24); \
} else { \
/* C_t = igated * cgated*/
\
return key + "any"; \
c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \
} \
_mm256_storeu_ps(ct, c); \
/* H_t = act_cell(C_t) * ogated */
\
o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \
_mm256_storeu_ps(ht, o); \
}
// TODO(TJ): optimize keq16
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
#endif
/* Peephole JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
PeepholeKernelImpl
:
public
LSTMKernel
<
T
>
{
public:
explicit
PeepholeKernelImpl
(
const
std
::
string
&
act_gate
,
const
std
::
string
&
act_cand
,
const
std
::
string
&
act_cell
,
int
d
)
:
LSTMKernel
<
T
>
()
{
d_
=
d
;
d2_
=
d
*
2
;
d3_
=
d
*
3
;
act_gate_d_
=
GetActKernel
<
T
>
(
act_gate
,
d
);
act_cand_d_
=
GetActKernel
<
T
>
(
act_cand
,
d
);
act_cell_d_
=
GetActKernel
<
T
>
(
act_cell
,
d
);
vmul_d_
=
KernelPool
::
Instance
().
template
Get
<
VMulKernel
<
T
>
>
(
d
);
vadd_d_
=
KernelPool
::
Instance
().
template
Get
<
VAddKernel
<
T
>
>
(
d
);
vadd_d2_
=
KernelPool
::
Instance
().
template
Get
<
VAddKernel
<
T
>
>
(
d2_
);
act_gate_d2_
=
GetActKernel
<
T
>
(
act_gate
,
d2_
);
}
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
,
T
*
checked
)
const
override
{
/* get fgated and igated*/
vmul_d_
->
Compute
(
wp_data
,
ct_1
,
checked
,
d_
);
vmul_d_
->
Compute
(
wp_data
+
d_
,
ct_1
,
checked
+
d_
,
d_
);
vadd_d2_
->
Compute
(
checked
,
gates
+
d_
,
gates
+
d_
,
d2_
);
act_gate_d2_
->
Compute
(
gates
+
d_
,
gates
+
d_
,
d2_
);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_
->
Compute
(
gates
,
gates
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
,
d_
);
/* get ogated*/
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
,
d_
);
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
,
d_
);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
,
d_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
/* C_t = igated * cgated*/
act_gate_d_
->
Compute
(
gates
+
d_
,
gates
+
d_
,
d_
);
act_cand_d_
->
Compute
(
gates
,
gates
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
,
d_
);
/* get outgated, put W_oc * C_t on igated */
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
,
d_
);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
,
d_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
,
d_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
}
private:
int
d_
,
d2_
,
d3_
;
std
::
shared_ptr
<
const
VActKernel
<
T
>>
act_gate_d2_
,
act_gate_d_
,
act_cand_d_
,
act_cell_d_
;
std
::
shared_ptr
<
const
VMulKernel
<
T
>>
vmul_d_
;
std
::
shared_ptr
<
const
VAddKernel
<
T
>>
vadd_d_
,
vadd_d2_
;
};
#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \
#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \
template <> \
template <> \
std::shared_ptr<const LSTMKernel<ker_dtype>> \
std::shared_ptr<const LSTMKernel<ker_dtype>> \
KernelPool::Get<LSTMKernel<ker_dtype>, const std::string&, \
KernelPool::Get<LSTMKernel<ker_dtype>, const lstm_attr_t&>( \
const std::string&, const std::string&, int, bool>( \
const lstm_attr_t& attr)
const std::string& act_gate, const std::string& act_cand, \
const std::string& act_cell, int d, bool use_peephole)
#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \
#define JITKERNEL_FIND_KEY_LSTM(ker_class, ker_dtype) \
#ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \
std::string key = ker_class##Impl<ker_dtype>::name(attr)
(use_peephole ? "p" : "n")
#define JITKERNEL_
NEW_LSTM_IMPL(ker, dtype, isa, k)
\
#define JITKERNEL_
LSTM_IMPL(ker, dtype)
\
if (
use_peephole) {
\
if (
attr.use_peephole) {
\
p = std::dynamic_pointer_cast<ker<dtype>>( \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<PeepholeKernelImpl<dtype, isa, k>>( \
std::make_shared<PeepholeKernelImpl<dtype>>(attr)); \
act_gate, act_cand, act_cell, d)); \
} else { \
} else { \
p = std::dynamic_pointer_cast<ker<dtype>>( \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_cand, \
std::make_shared<ker##Impl<dtype>>(attr)); \
act_cell, d)); \
}
}
REGISTER_JITKERNEL_ARGS_DEPRECATED
(
lstm
,
LSTMKernel
,
JITKERNEL_DECLARE_LSTM
,
REGISTER_JITKERNEL_ARGS
(
lstm
,
LSTMKernel
,
JITKERNEL_DEFINE_NAME_LSTM
,
JITKERNEL_KEY_LSTM
,
JITKERNEL_NEW_LSTM_IMPL
);
JITKERNEL_DECLARE_LSTM
,
JITKERNEL_FIND_KEY_LSTM
,
JITKERNEL_LSTM_IMPL
);
#undef INTRI8_FLOAT
#undef JITKERNEL_LSTM_IMPL
#undef JITKERNEL_FIND_KEY_LSTM
#undef JITKERNEL_DECLARE_LSTM
#undef JITKERNEL_DECLARE_LSTM
#undef JITKERNEL_KEY_LSTM
#undef JITKERNEL_DEFINE_NAME_LSTM
#undef JITKERNEL_NEW_LSTM_IMPL
/* GRU JitKernel */
/* GRU JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
template
<
typename
T
>
class
GRUKernelImpl
:
public
GRUKernel
<
T
>
{
class
GRUKernelImpl
:
public
GRUKernel
<
T
>
{
public:
public:
explicit
GRUKernelImpl
(
const
std
::
string
&
act_gate
,
static
inline
std
::
string
name
(
const
gru_attr_t
&
attr
)
{
const
std
::
string
&
act_state
,
int
d
)
PADDLE_THROW
(
"DType should be either float or double"
);
:
GRUKernel
<
T
>
()
{
d_
=
d
;
d2_
=
d
*
2
;
act_gate_d2_
=
GetActKernel
<
T
>
(
act_gate
,
d2_
);
act_gate_d_
=
GetActKernel
<
T
>
(
act_gate
,
d
);
act_state_d_
=
GetActKernel
<
T
>
(
act_state
,
d
);
vmul_d_
=
KernelPool
::
Instance
().
template
Get
<
VMulKernel
<
T
>
>
(
d
);
}
void
ComputeH1
(
T
*
gates
,
T
*
ht
)
const
override
{
act_gate_d_
->
Compute
(
gates
,
gates
,
d_
);
act_state_d_
->
Compute
(
gates
+
d2_
,
gates
+
d2_
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d2_
,
ht
,
d_
);
}
}
static
inline
bool
useJIT
(
int
d
)
{
return
false
;
}
void
ComputeHtPart1
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
override
{
static
inline
bool
useMKL
(
int
d
)
{
return
false
;
}
// W: {W_update, W_reset; W_state}
explicit
GRUKernelImpl
(
const
gru_attr_t
&
attr
)
:
GRUKernel
<
T
>
()
{
act_gate_d2_
->
Compute
(
gates
,
gates
,
d2_
);
#ifdef PADDLE_WITH_XBYAK
vmul_d_
->
Compute
(
ht_1
,
gates
+
d_
,
ht
,
d_
);
if
(
useJIT
(
attr
.
d
))
{
}
size_t
sz
=
96
+
attr
.
d
/
YMM_FLOAT_BLOCK
*
96
*
2
*
8
;
jitcode0_
.
reset
(
new
gen
::
GRUJitCode
(
0
,
attr
,
sz
>
4096
?
sz
:
4096
));
void
ComputeHtPart2
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
override
{
this
->
ComputeH1
=
T
*
y
=
gates
+
d2_
;
jitcode0_
->
getCode
<
void
(
*
)(
gru_t
*
,
const
gru_attr_t
*
)
>
();
act_state_d_
->
Compute
(
y
,
y
,
d_
);
// out = zt*ht~ + (1-zt)*ht_1
jitcode1_
.
reset
(
new
gen
::
GRUJitCode
(
1
,
attr
,
sz
>
4096
?
sz
:
4096
));
for
(
int
i
=
0
;
i
<
d_
;
++
i
)
{
this
->
ComputeHtPart1
=
ht
[
i
]
=
gates
[
i
]
*
y
[
i
]
+
(
static_cast
<
T
>
(
1
)
-
gates
[
i
])
*
ht_1
[
i
];
jitcode1_
->
getCode
<
void
(
*
)(
gru_t
*
,
const
gru_attr_t
*
)
>
();
jitcode2_
.
reset
(
new
gen
::
GRUJitCode
(
2
,
attr
,
sz
>
4096
?
sz
:
4096
));
this
->
ComputeHtPart2
=
jitcode2_
->
getCode
<
void
(
*
)(
gru_t
*
,
const
gru_attr_t
*
)
>
();
return
;
}
}
#endif
this
->
ComputeH1
=
refer
::
GRUH1
<
T
>
;
this
->
ComputeHtPart1
=
refer
::
GRUHtPart1
<
T
>
;
this
->
ComputeHtPart2
=
refer
::
GRUHtPart2
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
private:
private:
int
d_
,
d2_
;
std
::
unique_ptr
<
gen
::
GRUJitCode
>
jitcode0_
{
nullptr
},
jitcode1_
{
nullptr
},
std
::
shared_ptr
<
const
VActKernel
<
T
>>
act_gate_d2_
,
act_gate_d_
,
act_state_d_
;
jitcode2_
{
nullptr
};
std
::
shared_ptr
<
const
VMulKernel
<
T
>>
vmul_d_
;
#ifdef __AVX__
std
::
unique_ptr
<
const
AVXAct
>
avx_act_gate_
,
avx_act_state_
;
#endif
#endif
};
};
#define INTRI8_FLOAT(isa) \
#ifdef PADDLE_WITH_XBYAK
template
<
>
bool
GRUKernelImpl
<
float
>::
useJIT
(
int
d
)
{
return
gen
::
GRUJitCode
::
init
(
d
);
}
#endif
#define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class) \
template <> \
template <> \
GRUKernelImpl<float, isa, kEQ8>::GRUKernelImpl( \
std::string ker_class##Impl<float>::name(const gru_attr_t& attr) { \
const std::string& act_gate, const std::string& act_state, int d) \
std::string key(#ker_key "f"); \
: GRUKernel<float>() { \
key += (attr.act_gate + attr.act_cand); \
avx_act_gate_ = GetAVXAct<isa>(act_gate); \
if (useJIT(attr.d)) { \
avx_act_state_ = GetAVXAct<isa>(act_state); \
/* only jit code need record d*/
\
return key + "jit" + std::to_string(attr.d); \
} else if (useMKL(attr.d)) { \
return key + "mkl"; \
} else { \
return key + "any"; \
} \
} \
template <> \
void GRUKernelImpl<float, isa, kEQ8>::ComputeH1(float* gates, float* ht) \
const { \
__m256 u, s; \
/* W: {W_update, W_reset; W_state} */
\
u = _mm256_loadu_ps(gates); \
s = _mm256_loadu_ps(gates + 16); \
s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \
_mm256_storeu_ps(ht, s); \
} \
} \
template <> \
template <> \
void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart1( \
std::string ker_class##Impl<double>::name(const gru_attr_t& attr) { \
float* gates, const float* ht_1, float* ht) const { \
std::string key(#ker_key "d"); \
/* not exactly equal the any implementation */
\
/* jit code do not support double yet*/
\
__m256 r, ht0; \
if (useMKL(attr.d)) { \
r = _mm256_loadu_ps(gates + 8); \
return key + "mkl"; \
ht0 = _mm256_loadu_ps(ht_1); \
} else { \
r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0); \
return key + "any"; \
_mm256_storeu_ps(ht, r); \
} \
} \
template <> \
void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart2( \
float* gates, const float* ht_1, float* ht) const { \
/* not exactly equal the any implementation */
\
__m256 u, s, ht0; \
u = _mm256_loadu_ps(gates); \
s = _mm256_loadu_ps(gates + 16); \
ht0 = _mm256_loadu_ps(ht_1); \
u = avx_act_gate_->Compute(u); \
s = _mm256_mul_ps(u, avx_act_state_->Compute(s)); \
u = _mm256_sub_ps(_mm256_set1_ps(1.f), u); \
u = _mm256_mul_ps(u, ht0); \
u = _mm256_add_ps(s, u); \
_mm256_storeu_ps(ht, u); \
}
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
#endif
#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \
#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \
template <> \
template <> \
std::shared_ptr<const
GRUKernel<ker_dtype>> KernelPool::Get<
\
std::shared_ptr<const
ker_class<ker_dtype>>
\
GRUKernel<ker_dtype>, const std::string&, const std::string&, int
>( \
KernelPool::Get<ker_class<ker_dtype>, const gru_attr_t&
>( \
const
std::string& act_gate, const std::string& act_state, int d
)
const
gru_attr_t& attr
)
#define JITKERNEL_
KEY_GRU(ker_key, dtype_key
) \
#define JITKERNEL_
FIND_KEY_GRU(ker_class, ker_dtype
) \
#ker_key #dtype_key + std::to_string(d) + act_gate + act_state
std::string key = ker_class##Impl<ker_dtype>::name(attr)
#define JITKERNEL_
NEW_GRU_IMPL(ker, dtype, isa, k)
\
#define JITKERNEL_
GRU_IMPL(ker, dtype)
\
p = std::dynamic_pointer_cast<ker<dtype>>( \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype
, isa, k>>(act_gate, act_state, d
));
std::make_shared<ker##Impl<dtype
>>(attr
));
REGISTER_JITKERNEL_ARGS_DEPRECATED
(
gru
,
GRUKernel
,
JITKERNEL_DECLARE_GRU
,
REGISTER_JITKERNEL_ARGS
(
gru
,
GRUKernel
,
JITKERNEL_DEFINE_NAME_GRU
,
JITKERNEL_KEY_GRU
,
JITKERNEL_NEW_GRU_IMPL
);
JITKERNEL_DECLARE_GRU
,
JITKERNEL_FIND_KEY_GRU
,
JITKERNEL_GRU_IMPL
);
#undef INTRI8_FLOAT
#undef JITKERNEL_GRU_IMPL
#undef JITKERNEL_NEW_GRU_IMPL
#undef JITKERNEL_FIND_KEY_GRU
#undef JITKERNEL_KEY_GRU
#undef JITKERNEL_DECLARE_GRU
#undef JITKERNEL_DECLARE_GRU
#undef JITKERNEL_DEFINE_NAME_GRU
}
// namespace jitkernel
}
// namespace jitkernel
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
7e4bd695
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <sys/time.h>
#include <cmath> // for exp
#include <cmath> // for exp
#include <cstring> // for memcpy
#include <cstring> // for memcpy
#include <random>
#include <random>
...
@@ -22,6 +21,8 @@ limitations under the License. */
...
@@ -22,6 +21,8 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "gtest/gtest.h"
#include "paddle/fluid/operators/math/jit_kernel_refer.h"
#include "paddle/fluid/platform/port.h"
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#include "paddle/fluid/platform/dynload/mklml.h"
...
@@ -53,12 +54,6 @@ void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
...
@@ -53,12 +54,6 @@ void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
}
}
}
}
void
vrelu_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
>
0.
f
?
x
[
i
]
:
0.
f
;
}
}
#if defined __AVX__ || defined __AVX2__
#if defined __AVX__ || defined __AVX2__
void
vrelu_intri8
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
void
vrelu_intri8
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
__m256
tmp
=
_mm256_loadu_ps
(
x
);
__m256
tmp
=
_mm256_loadu_ps
(
x
);
...
@@ -69,6 +64,7 @@ void vrelu_intri8(const int n, const float* x, float* y) {
...
@@ -69,6 +64,7 @@ void vrelu_intri8(const int n, const float* x, float* y) {
TEST
(
JitKernel
,
vrelu
)
{
TEST
(
JitKernel
,
vrelu
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
refer
=
paddle
::
operators
::
math
::
jitkernel
::
refer
;
for
(
int
d
:
{
3
,
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
for
(
int
d
:
{
3
,
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
...
@@ -80,7 +76,7 @@ TEST(JitKernel, vrelu) {
...
@@ -80,7 +76,7 @@ TEST(JitKernel, vrelu) {
float
*
zref_data
=
zref
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vrelu_ref
(
d
,
x_data
,
zref_data
);
refer
::
VRelu
<
float
>
(
x_data
,
zref_data
,
d
);
}
}
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
#if defined __AVX__ || defined __AVX2__
#if defined __AVX__ || defined __AVX2__
...
@@ -90,7 +86,7 @@ TEST(JitKernel, vrelu) {
...
@@ -90,7 +86,7 @@ TEST(JitKernel, vrelu) {
vrelu_intri8
(
d
,
x_data
,
zref_data
);
vrelu_intri8
(
d
,
x_data
,
zref_data
);
}
}
auto
si1
=
GetCurrentUS
();
auto
si1
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
VLOG
(
30
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
<<
" us"
;
}
}
#endif
#endif
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
...
@@ -100,21 +96,16 @@ TEST(JitKernel, vrelu) {
...
@@ -100,21 +96,16 @@ TEST(JitKernel, vrelu) {
auto
ttgte
=
GetCurrentUS
();
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
}
}
}
void
vaddbias_ref
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
+
a
;
}
}
TEST
(
JitKernel
,
vaddbias
)
{
TEST
(
JitKernel
,
vaddbias
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
refer
=
paddle
::
operators
::
math
::
jitkernel
::
refer
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
64
,
100
,
128
,
256
})
{
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
...
@@ -127,7 +118,7 @@ TEST(JitKernel, vaddbias) {
...
@@ -127,7 +118,7 @@ TEST(JitKernel, vaddbias) {
float
*
zref_data
=
zref
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vaddbias_ref
(
d
,
a
,
x_data
,
zref_data
);
refer
::
VAddBias
<
float
>
(
&
a
,
x_data
,
zref_data
,
d
);
}
}
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
...
@@ -138,19 +129,13 @@ TEST(JitKernel, vaddbias) {
...
@@ -138,19 +129,13 @@ TEST(JitKernel, vaddbias) {
VLOG
(
30
)
<<
"Vec size "
<<
d
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
}
}
}
void
vexp_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
void
vexp_mkl
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
void
vexp_mkl
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
paddle
::
platform
::
dynload
::
vsExp
(
n
,
x
,
y
);
paddle
::
platform
::
dynload
::
vsExp
(
n
,
x
,
y
);
...
@@ -159,6 +144,7 @@ void vexp_mkl(const int n, const float* x, float* y) {
...
@@ -159,6 +144,7 @@ void vexp_mkl(const int n, const float* x, float* y) {
TEST
(
JitKernel
,
vexp
)
{
TEST
(
JitKernel
,
vexp
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
refer
=
paddle
::
operators
::
math
::
jitkernel
::
refer
;
for
(
int
d
:
{
1
,
3
,
4
,
6
,
7
,
8
,
12
,
15
,
16
,
20
,
30
,
128
,
256
})
{
for
(
int
d
:
{
1
,
3
,
4
,
6
,
7
,
8
,
12
,
15
,
16
,
20
,
30
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
...
@@ -170,7 +156,7 @@ TEST(JitKernel, vexp) {
...
@@ -170,7 +156,7 @@ TEST(JitKernel, vexp) {
float
*
zref_data
=
zref
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vexp_ref
(
d
,
x_data
,
zref_data
);
refer
::
VExp
<
float
>
(
x_data
,
zref_data
,
d
);
}
}
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
...
@@ -196,26 +182,13 @@ TEST(JitKernel, vexp) {
...
@@ -196,26 +182,13 @@ TEST(JitKernel, vexp) {
#else
#else
<<
" us, "
<<
" us, "
#endif
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
}
}
}
inline
float
_sigmoid
(
float
x
)
{
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
float
tmp
=
(
x
<
min
)
?
min
:
((
x
>
max
)
?
max
:
x
);
return
1.
f
/
(
1.
f
+
std
::
exp
(
-
tmp
));
}
void
vsigmoid_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
_sigmoid
(
x
[
i
]);
}
}
void
vsigmoid_better
(
void
vsigmoid_better
(
const
std
::
shared_ptr
<
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VExpKernel
<
float
>>&
vexp
,
const
paddle
::
operators
::
math
::
jitkernel
::
VExpKernel
<
float
>>&
vexp
,
...
@@ -234,6 +207,7 @@ void vsigmoid_better(
...
@@ -234,6 +207,7 @@ void vsigmoid_better(
TEST
(
JitKernel
,
vsigmoid
)
{
TEST
(
JitKernel
,
vsigmoid
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
refer
=
paddle
::
operators
::
math
::
jitkernel
::
refer
;
for
(
int
d
:
{
1
,
3
,
4
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
for
(
int
d
:
{
1
,
3
,
4
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
...
@@ -252,7 +226,7 @@ TEST(JitKernel, vsigmoid) {
...
@@ -252,7 +226,7 @@ TEST(JitKernel, vsigmoid) {
auto
tmkle
=
GetCurrentUS
();
auto
tmkle
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vsigmoid_ref
(
d
,
x_data
,
zref_data
);
refer
::
VSigmoid
<
float
>
(
x_data
,
zref_data
,
d
);
}
}
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
...
@@ -264,21 +238,13 @@ TEST(JitKernel, vsigmoid) {
...
@@ -264,21 +238,13 @@ TEST(JitKernel, vsigmoid) {
VLOG
(
30
)
<<
"Vec size "
<<
d
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
}
}
}
inline
float
_tanh
(
float
x
)
{
return
2.
f
*
_sigmoid
(
2.
f
*
x
)
-
1.
f
;
}
void
vtanh_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
_tanh
(
x
[
i
]);
}
}
void
vtanh_better
(
void
vtanh_better
(
const
std
::
shared_ptr
<
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VScalKernel
<
float
>>&
vscal
,
const
paddle
::
operators
::
math
::
jitkernel
::
VScalKernel
<
float
>>&
vscal
,
...
@@ -298,6 +264,7 @@ void vtanh_better(
...
@@ -298,6 +264,7 @@ void vtanh_better(
TEST
(
JitKernel
,
vtanh
)
{
TEST
(
JitKernel
,
vtanh
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
refer
=
paddle
::
operators
::
math
::
jitkernel
::
refer
;
for
(
int
d
:
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
for
(
int
d
:
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
...
@@ -320,7 +287,7 @@ TEST(JitKernel, vtanh) {
...
@@ -320,7 +287,7 @@ TEST(JitKernel, vtanh) {
auto
tmkle
=
GetCurrentUS
();
auto
tmkle
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vtanh_ref
(
d
,
x_data
,
zref_data
);
refer
::
VTanh
<
float
>
(
x_data
,
zref_data
,
d
);
}
}
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
...
@@ -332,39 +299,13 @@ TEST(JitKernel, vtanh) {
...
@@ -332,39 +299,13 @@ TEST(JitKernel, vtanh) {
VLOG
(
30
)
<<
"Vec size "
<<
d
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
}
}
}
void
lstm_ctht_ref
(
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VSigmoidKernel
<
float
>>&
vsigmoid_3d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VTanhKernel
<
float
>>&
vtanh_d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VExpKernel
<
float
>>&
vexp_1
,
const
int
d
,
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
vsigmoid_3d
->
Compute
(
gates
+
d
,
gates
+
d
,
3
*
d
);
vtanh_d
->
Compute
(
gates
,
gates
,
d
);
const
float
*
i
=
gates
+
d
,
*
f
=
gates
+
d
*
2
,
*
o
=
gates
+
d
*
3
;
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
k
=
0
;
k
<
d
;
++
k
)
{
// C_t = C_t-1 * fgated + cand_gated * igated
ct
[
k
]
=
ct_1
[
k
]
*
f
[
k
]
+
gates
[
k
]
*
i
[
k
];
// H_t = act_cell(C_t) * ogated
float
tmp
=
ct
[
k
]
*
2
;
tmp
=
0.
f
-
((
tmp
<
min
)
?
min
:
((
tmp
>
max
)
?
max
:
tmp
));
vexp_1
->
Compute
(
&
tmp
,
&
tmp
,
1
);
tmp
=
2.
f
/
(
1.
f
+
tmp
)
-
1.
f
;
ht
[
k
]
=
tmp
*
o
[
k
];
}
}
void
lstm_ctht_better
(
void
lstm_ctht_better
(
const
std
::
shared_ptr
<
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VSigmoidKernel
<
float
>>&
const
paddle
::
operators
::
math
::
jitkernel
::
VSigmoidKernel
<
float
>>&
...
@@ -389,6 +330,7 @@ void lstm_ctht_better(
...
@@ -389,6 +330,7 @@ void lstm_ctht_better(
TEST
(
JitKernel
,
lstm
)
{
TEST
(
JitKernel
,
lstm
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
refer
=
paddle
::
operators
::
math
::
jitkernel
::
refer
;
for
(
int
d
:
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
})
{
for
(
int
d
:
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
})
{
int
d4
=
d
*
4
;
int
d4
=
d
*
4
;
int
d3
=
d
*
3
;
int
d3
=
d
*
3
;
...
@@ -399,19 +341,17 @@ TEST(JitKernel, lstm) {
...
@@ -399,19 +341,17 @@ TEST(JitKernel, lstm) {
RandomVec
<
float
>
(
d
,
ct_1
.
data
(),
-
2.
f
,
2.
f
);
RandomVec
<
float
>
(
d
,
ct_1
.
data
(),
-
2.
f
,
2.
f
);
memcpy
(
xref
.
data
(),
x
.
data
(),
sizeof
(
float
)
*
d4
);
memcpy
(
xref
.
data
(),
x
.
data
(),
sizeof
(
float
)
*
d4
);
std
::
string
act_gate
=
"sigmoid"
,
act_cand
=
"tanh"
,
act_cell
=
"tanh"
;
std
::
string
act_gate
=
"sigmoid"
,
act_cand
=
"tanh"
,
act_cell
=
"tanh"
;
const
jit
::
lstm_attr_t
attr
(
d
,
act_gate
,
act_cand
,
act_cell
,
false
);
const
auto
&
ker
=
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
()
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
jit
::
lstm_attr_t
&>
(
const
std
::
string
&
,
const
std
::
string
&>
(
attr
);
act_gate
,
act_cand
,
act_cell
,
d
,
false
);
// below kernels are used to compute refer
// below kernels are used to compute refer
const
auto
&
vsigmoid_3d
=
const
auto
&
vsigmoid_3d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VSigmoidKernel
<
float
>
>
(
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VSigmoidKernel
<
float
>
>
(
d3
);
d3
);
const
auto
&
vtanh_d
=
const
auto
&
vtanh_d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VTanhKernel
<
float
>
>
(
d
);
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VTanhKernel
<
float
>
>
(
d
);
const
auto
&
vexp_1
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VExpKernel
<
float
>
>
(
1
);
const
auto
&
vmul_d
=
const
auto
&
vmul_d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VMulKernel
<
float
>
>
(
d
);
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VMulKernel
<
float
>
>
(
d
);
const
auto
&
vadd_d
=
const
auto
&
vadd_d
=
...
@@ -425,9 +365,17 @@ TEST(JitKernel, lstm) {
...
@@ -425,9 +365,17 @@ TEST(JitKernel, lstm) {
float
*
ct_ref_data
=
ct_ref
.
data
();
float
*
ct_ref_data
=
ct_ref
.
data
();
float
*
ht_ref_data
=
ht_ref
.
data
();
float
*
ht_ref_data
=
ht_ref
.
data
();
// compute once to check correctness
// compute once to check correctness
lstm_ctht_ref
(
vsigmoid_3d
,
vtanh_d
,
vexp_1
,
d
,
xref_data
,
ct_1_data
,
jit
::
lstm_t
step
;
ct_ref_data
,
ht_ref_data
);
step
.
gates
=
xref_data
;
ker
->
ComputeCtHt
(
x_data
,
ct_1_data
,
ct_tgt_data
,
ht_tgt_data
);
step
.
ct_1
=
ct_1_data
;
step
.
ct
=
ct_ref_data
;
step
.
ht
=
ht_ref_data
;
refer
::
LSTMCtHt
<
float
>
(
&
step
,
&
attr
);
step
.
gates
=
x_data
;
step
.
ct
=
ct_tgt_data
;
step
.
ht
=
ht_tgt_data
;
ker
->
ComputeCtHt
(
&
step
,
&
attr
);
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ct_tgt_data
[
i
],
ct_ref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ct_tgt_data
[
i
],
ct_ref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ht_tgt_data
[
i
],
ht_ref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ht_tgt_data
[
i
],
ht_ref_data
[
i
],
1e-3
);
...
@@ -441,32 +389,21 @@ TEST(JitKernel, lstm) {
...
@@ -441,32 +389,21 @@ TEST(JitKernel, lstm) {
auto
tmkle
=
GetCurrentUS
();
auto
tmkle
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
lstm_ctht_ref
(
vsigmoid_3d
,
vtanh_d
,
vexp_1
,
d
,
xref_data
,
ct_1_data
,
refer
::
LSTMCtHt
<
float
>
(
&
step
,
&
attr
);
ct_ref_data
,
ht_ref_data
);
}
}
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
ComputeCtHt
(
x_data
,
ct_1_data
,
ct_tgt_data
,
ht_tgt_data
);
ker
->
ComputeCtHt
(
&
step
,
&
attr
);
}
}
auto
ttgte
=
GetCurrentUS
();
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, better(jit) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
}
}
}
}
void
vscal_ref
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
*
x
[
i
];
}
}
void
vscal_inp_ref
(
const
int
n
,
const
float
a
,
float
*
x
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
x
[
i
]
=
a
*
x
[
i
];
}
}
#if defined __AVX__ || defined __AVX2__
#if defined __AVX__ || defined __AVX2__
void
vscal_intri8
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
void
vscal_intri8
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
__m256
tmp
;
__m256
tmp
;
...
@@ -492,6 +429,7 @@ void vscal_inp_mkl(const int n, const float a, float* x) {
...
@@ -492,6 +429,7 @@ void vscal_inp_mkl(const int n, const float a, float* x) {
TEST
(
JitKernel
,
vscal
)
{
TEST
(
JitKernel
,
vscal
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
refer
=
paddle
::
operators
::
math
::
jitkernel
::
refer
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
...
@@ -506,12 +444,12 @@ TEST(JitKernel, vscal) {
...
@@ -506,12 +444,12 @@ TEST(JitKernel, vscal) {
float
*
zref_data
=
zref
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_ref
(
d
,
a
,
x_data
,
zref_data
);
refer
::
VScal
<
float
>
(
&
a
,
x_data
,
zref_data
,
d
);
}
}
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
auto
trefs1
=
GetCurrentUS
();
auto
trefs1
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_inp_ref
(
d
,
a
,
y_data
);
refer
::
VScal
<
float
>
(
&
a
,
y_data
,
y_data
,
d
);
}
}
auto
trefe1
=
GetCurrentUS
();
auto
trefe1
=
GetCurrentUS
();
...
@@ -536,7 +474,7 @@ TEST(JitKernel, vscal) {
...
@@ -536,7 +474,7 @@ TEST(JitKernel, vscal) {
}
}
auto
si3
=
GetCurrentUS
();
auto
si3
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
VLOG
(
30
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
<<
" us, inplace: "
<<
(
si3
-
si2
)
/
repeat
;
<<
" us, inplace: "
<<
(
si3
-
si2
)
/
repeat
<<
" us"
;
}
}
#endif
#endif
...
@@ -560,19 +498,14 @@ TEST(JitKernel, vscal) {
...
@@ -560,19 +498,14 @@ TEST(JitKernel, vscal) {
<<
" us, "
<<
" us, "
#endif
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
"us, tgt inplace takes: "
<<
(
ttgte1
-
ttgts1
)
/
repeat
;
<<
"us, tgt inplace takes: "
<<
(
ttgte1
-
ttgts1
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
}
}
}
void
vmul_ref
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
z
[
i
]
=
x
[
i
]
*
y
[
i
];
}
}
#if defined __AVX__ || defined __AVX2__
#if defined __AVX__ || defined __AVX2__
void
vmul_intri8
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
void
vmul_intri8
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
__m256
tmpx
,
tmpy
;
__m256
tmpx
,
tmpy
;
...
@@ -591,6 +524,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) {
...
@@ -591,6 +524,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) {
TEST
(
JitKernel
,
vmul
)
{
TEST
(
JitKernel
,
vmul
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
refer
=
paddle
::
operators
::
math
::
jitkernel
::
refer
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
20
,
30
,
256
,
512
,
1000
,
1024
})
{
for
(
int
d
:
{
7
,
8
,
15
,
16
,
20
,
30
,
256
,
512
,
1000
,
1024
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
...
@@ -604,7 +538,7 @@ TEST(JitKernel, vmul) {
...
@@ -604,7 +538,7 @@ TEST(JitKernel, vmul) {
float
*
zref_data
=
zref
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vmul_ref
(
d
,
x_data
,
y_data
,
zref_data
);
refer
::
VMul
<
float
>
(
x_data
,
y_data
,
zref_data
,
d
);
}
}
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
...
@@ -640,19 +574,13 @@ TEST(JitKernel, vmul) {
...
@@ -640,19 +574,13 @@ TEST(JitKernel, vmul) {
#else
#else
<<
" us, "
<<
" us, "
#endif
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
}
}
}
void
vadd_ref
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
z
[
i
]
=
x
[
i
]
+
y
[
i
];
}
}
#if defined __AVX__ || defined __AVX2__
#if defined __AVX__ || defined __AVX2__
void
vadd_intri8
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
void
vadd_intri8
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
__m256
tmpx
,
tmpy
;
__m256
tmpx
,
tmpy
;
...
@@ -671,6 +599,7 @@ void vadd_mkl(const int n, const float* x, const float* y, float* z) {
...
@@ -671,6 +599,7 @@ void vadd_mkl(const int n, const float* x, const float* y, float* z) {
TEST
(
JitKernel
,
vadd
)
{
TEST
(
JitKernel
,
vadd
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
refer
=
paddle
::
operators
::
math
::
jitkernel
::
refer
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
...
@@ -684,7 +613,7 @@ TEST(JitKernel, vadd) {
...
@@ -684,7 +613,7 @@ TEST(JitKernel, vadd) {
float
*
zref_data
=
zref
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vadd_ref
(
d
,
x_data
,
y_data
,
zref_data
);
refer
::
VAdd
<
float
>
(
x_data
,
y_data
,
zref_data
,
d
);
}
}
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
...
@@ -720,19 +649,13 @@ TEST(JitKernel, vadd) {
...
@@ -720,19 +649,13 @@ TEST(JitKernel, vadd) {
#else
#else
<<
" us, "
<<
" us, "
#endif
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
}
}
}
// Reference (scalar) fused add + ReLU used as the ground truth in the
// JitKernel tests: z[k] = max-like clamp of (x[k] + y[k]) at zero, i.e. the
// sum when it is strictly positive and 0 otherwise, for every k in [0, n).
void vaddrelu_ref(const int n, const float* x, const float* y, float* z) {
  int k = 0;
  while (k < n) {
    const float sum = x[k] + y[k];
    if (sum > 0) {
      z[k] = sum;
    } else {
      z[k] = 0;
    }
    ++k;
  }
}
void
vaddrelu_better
(
void
vaddrelu_better
(
const
std
::
shared_ptr
<
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VAddKernel
<
float
>>&
vadd
,
const
paddle
::
operators
::
math
::
jitkernel
::
VAddKernel
<
float
>>&
vadd
,
...
@@ -745,6 +668,7 @@ void vaddrelu_better(
...
@@ -745,6 +668,7 @@ void vaddrelu_better(
TEST
(
JitKernel
,
vaddrelu
)
{
TEST
(
JitKernel
,
vaddrelu
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
refer
=
paddle
::
operators
::
math
::
jitkernel
::
refer
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
...
@@ -762,7 +686,7 @@ TEST(JitKernel, vaddrelu) {
...
@@ -762,7 +686,7 @@ TEST(JitKernel, vaddrelu) {
float
*
zref_data
=
zref
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vaddrelu_ref
(
d
,
x_data
,
y_data
,
zref_data
);
refer
::
VAddRelu
<
float
>
(
x_data
,
y_data
,
zref_data
,
d
);
}
}
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
auto
tmkls
=
GetCurrentUS
();
auto
tmkls
=
GetCurrentUS
();
...
@@ -778,7 +702,7 @@ TEST(JitKernel, vaddrelu) {
...
@@ -778,7 +702,7 @@ TEST(JitKernel, vaddrelu) {
VLOG
(
30
)
<<
"Vec size "
<<
d
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
" us, better takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
...
@@ -789,21 +713,23 @@ TEST(JitKernel, pool) {
...
@@ -789,21 +713,23 @@ TEST(JitKernel, pool) {
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
const
int
frame_size
=
4
;
const
int
frame_size
=
4
;
std
::
string
act_gate
=
"sigmoid"
,
act_cand
=
"tanh"
,
act_cell
=
"tanh"
;
std
::
string
act_gate
=
"sigmoid"
,
act_cand
=
"tanh"
,
act_cell
=
"tanh"
;
jit
::
lstm_attr_t
attr
(
frame_size
,
act_gate
,
act_cand
,
act_cell
,
false
);
// Make an empty call to avoid the unknown flag 'use_pinned_memory' on Mac
paddle
::
platform
::
jit
::
MayIUse
(
paddle
::
platform
::
jit
::
avx
);
const
auto
&
plstm1
=
const
auto
&
plstm1
=
jit
::
KernelPool
::
Instance
()
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
jit
::
lstm_attr_t
&>
(
attr
);
const
std
::
string
&
,
const
std
::
string
&>
(
act_gate
,
act_cand
,
act_cell
,
frame_size
,
false
);
const
auto
&
plstm2
=
const
auto
&
plstm2
=
jit
::
KernelPool
::
Instance
()
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
jit
::
lstm_attr_t
&>
(
attr
);
const
std
::
string
&
,
const
std
::
string
&>
(
EXPECT_EQ
(
plstm1
,
plstm2
);
act_gate
,
act_cand
,
act_cell
,
frame_size
,
false
);
const
auto
&
peephole
=
const
auto
&
peephole
=
jit
::
KernelPool
::
Instance
()
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
jit
::
lstm_attr_t
&>
(
const
std
::
string
&
,
const
std
::
string
&>
(
jit
::
lstm_attr_t
(
frame_size
,
act_gate
,
act_cand
,
act_cell
,
true
));
act_gate
,
act_cand
,
act_cell
,
frame_size
,
true
);
EXPECT_TRUE
(
plstm1
!=
peephole
);
EXPECT_TRUE
(
plstm1
!=
peephole
);
const
auto
&
pvmul_f
=
const
auto
&
pvmul_f
=
...
...
paddle/fluid/operators/sum_op.h
浏览文件 @
7e4bd695
...
@@ -127,6 +127,9 @@ class SumKernel : public framework::OpKernel<T> {
...
@@ -127,6 +127,9 @@ class SumKernel : public framework::OpKernel<T> {
math
::
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_add
;
math
::
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_add
;
merge_add
(
context
.
template
device_context
<
DeviceContext
>(),
inputs
,
merge_add
(
context
.
template
device_context
<
DeviceContext
>(),
inputs
,
out
);
out
);
out
->
SyncIndex
();
}
else
{
}
else
{
// no data, just set an empty out tensor.
// no data, just set an empty out tensor.
out
->
mutable_value
()
->
mutable_data
<
T
>
(
framework
::
make_ddim
({
0
}),
out
->
mutable_value
()
->
mutable_data
<
T
>
(
framework
::
make_ddim
({
0
}),
...
...
paddle/fluid/operators/tensor_array_to_tensor_op.cc
浏览文件 @
7e4bd695
...
@@ -106,9 +106,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
...
@@ -106,9 +106,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
out_inx_dim
[
0
]
=
inx
.
size
();
out_inx_dim
[
0
]
=
inx
.
size
();
out_inx
.
Resize
(
out_inx_dim
);
out_inx
.
Resize
(
out_inx_dim
);
auto
&
local_scope
=
scope
.
NewScope
();
std
::
string
var_name
=
"out_index"
;
std
::
string
var_name
=
"out_index"
;
framework
::
Variable
*
tmp_index_var
=
framework
::
Variable
*
tmp_index_var
=
local_scope
.
Var
(
var_name
);
const_cast
<
framework
::
Scope
&>
(
scope
).
Var
(
var_name
);
auto
&
tmp_index_tensor
=
auto
&
tmp_index_tensor
=
*
(
tmp_index_var
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
());
*
(
tmp_index_var
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
());
tmp_index_tensor
.
Resize
(
out_inx_dim
);
tmp_index_tensor
.
Resize
(
out_inx_dim
);
...
@@ -128,12 +128,12 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
...
@@ -128,12 +128,12 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
out_dims
[
axis
]
=
out_dim_sum
;
out_dims
[
axis
]
=
out_dim_sum
;
out
.
Resize
(
out_dims
);
out
.
Resize
(
out_dims
);
LodTensorArray2LodTensorVector
(
scope
,
base_name
,
Input
(
"X"
),
&
names
);
LodTensorArray2LodTensorVector
(
local_
scope
,
base_name
,
Input
(
"X"
),
&
names
);
// Invoke
Reshape
Op
// Invoke
concat
Op
auto
concat_op
=
framework
::
OpRegistry
::
CreateOp
(
auto
concat_op
=
framework
::
OpRegistry
::
CreateOp
(
"concat"
,
{{
"X"
,
names
}},
{{
"Out"
,
{
Output
(
"Out"
)}}},
attrs
);
"concat"
,
{{
"X"
,
names
}},
{{
"Out"
,
{
Output
(
"Out"
)}}},
attrs
);
concat_op
->
Run
(
scope
,
place
);
concat_op
->
Run
(
local_
scope
,
place
);
}
}
};
};
...
...
paddle/fluid/platform/cpu_helper.cc
浏览文件 @
7e4bd695
...
@@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) {
...
@@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) {
#elif defined(PADDLE_WITH_MKLML)
#elif defined(PADDLE_WITH_MKLML)
int
real_num_threads
=
num_threads
>
1
?
num_threads
:
1
;
int
real_num_threads
=
num_threads
>
1
?
num_threads
:
1
;
platform
::
dynload
::
MKL_Set_Num_Threads
(
real_num_threads
);
platform
::
dynload
::
MKL_Set_Num_Threads
(
real_num_threads
);
omp_set_num_threads
(
num_threads
);
omp_set_num_threads
(
real_
num_threads
);
#else
#else
PADDLE_ENFORCE
(
false
,
"To be implemented."
);
PADDLE_ENFORCE
(
false
,
"To be implemented."
);
#endif
#endif
...
...
paddle/fluid/platform/cudnn_helper.h
浏览文件 @
7e4bd695
...
@@ -62,7 +62,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
...
@@ -62,7 +62,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
#define CUDNN_ENFORCE(condition) \
#define CUDNN_ENFORCE(condition) \
do { \
do { \
cudnnStatus_t status = condition;
\
auto status = condition;
\
if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \
if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \
PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
} \
} \
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
7e4bd695
...
@@ -143,6 +143,39 @@ class CudnnWorkspaceHandle {
...
@@ -143,6 +143,39 @@ class CudnnWorkspaceHandle {
std
::
unique_ptr
<
std
::
lock_guard
<
std
::
mutex
>>
guard_
;
std
::
unique_ptr
<
std
::
lock_guard
<
std
::
mutex
>>
guard_
;
};
};
#if CUDA_VERSION >= 9000
// Scope guard that switches a cuBLAS handle to `new_math_mode` for the
// lifetime of the object and restores the previous math mode on destruction.
//
// The constructor reads the handle's current math mode; only if it differs
// from `new_math_mode` is the mode changed (and later restored).
class ScopedCublasMathMode {
 public:
  ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode)
      : handle_(handle), need_reset_(false) {
    PADDLE_ENFORCE(
        platform::dynload::cublasGetMathMode(handle_, &old_math_mode_),
        "Failed to get old cublas math mode");
    if (old_math_mode_ != new_math_mode) {
      PADDLE_ENFORCE(
          platform::dynload::cublasSetMathMode(handle_, new_math_mode),
          "Failed to set new cublas math mode");
      // Only mark for restoration once the switch has actually succeeded.
      need_reset_ = true;
    }
  }

  // A copy would restore the old math mode twice, so the guard is
  // deliberately non-copyable.
  ScopedCublasMathMode(const ScopedCublasMathMode&) = delete;
  ScopedCublasMathMode& operator=(const ScopedCublasMathMode&) = delete;

  // NOTE(review): PADDLE_ENFORCE is used inside a destructor here; if it
  // throws on failure this will terminate the program — confirm that this
  // is the intended failure mode.
  ~ScopedCublasMathMode() {
    if (need_reset_) {
      PADDLE_ENFORCE(
          platform::dynload::cublasSetMathMode(handle_, old_math_mode_),
          "Failed to set old cublas math mode");
    }
  }

 private:
  cublasHandle_t handle_;        // handle whose math mode is being scoped
  cublasMath_t old_math_mode_;   // mode read from the handle at construction
  bool need_reset_;              // true iff the constructor changed the mode
};
#endif
class
CUDADeviceContext
:
public
DeviceContext
{
class
CUDADeviceContext
:
public
DeviceContext
{
public:
public:
explicit
CUDADeviceContext
(
CUDAPlace
place
);
explicit
CUDADeviceContext
(
CUDAPlace
place
);
...
@@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext {
...
@@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext {
callback_manager_
->
Wait
();
callback_manager_
->
Wait
();
}
}
#if CUDA_VERSION >= 9000
/*! \brief Run `callback` with the cuBLAS handle temporarily set to `new_math`.
 *
 *  The cuBLAS handle may be shared by several threads and this call may
 *  change its configuration, so the whole operation is serialized on
 *  cublas_mtx_. The math mode is switched through ScopedCublasMathMode,
 *  which puts the previous mode back when this function returns. */
template <typename Callback>
void CublasCall(Callback callback, cublasMath_t new_math) {
  std::lock_guard<std::mutex> cublas_lock(cublas_mtx_);
  ScopedCublasMathMode math_mode_scope(cublas_handle_, new_math);
  callback();
}
#endif
private:
private:
CUDAPlace
place_
;
CUDAPlace
place_
;
...
@@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext {
...
@@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext {
// If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
// If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
mutable
std
::
mutex
callback_mtx_
;
mutable
std
::
mutex
callback_mtx_
;
std
::
unique_ptr
<
StreamCallbackManager
>
callback_manager_
;
std
::
unique_ptr
<
StreamCallbackManager
>
callback_manager_
;
mutable
std
::
mutex
cublas_mtx_
;
};
};
template
<
>
template
<
>
...
...
paddle/fluid/platform/dynload/cublas.cc
浏览文件 @
7e4bd695
...
@@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
...
@@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
CUBLAS_BLAS_ROUTINE_EACH_R3
(
DEFINE_WRAP
);
CUBLAS_BLAS_ROUTINE_EACH_R3
(
DEFINE_WRAP
);
#endif
#endif
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
CUBLAS_BLAS_ROUTINE_EACH_R4
(
DEFINE_WRAP
);
#endif
}
// namespace dynload
}
// namespace dynload
}
// namespace platform
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
paddle/fluid/platform/dynload/cublas.h
浏览文件 @
7e4bd695
...
@@ -61,9 +61,6 @@ extern void *cublas_dso_handle;
...
@@ -61,9 +61,6 @@ extern void *cublas_dso_handle;
extern DynLoad__##__name __name
extern DynLoad__##__name __name
#endif
#endif
#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
__macro(cublasDaxpy_v2); \
...
@@ -106,11 +103,22 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
...
@@ -106,11 +103,22 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 9.0
// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
#if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode);
#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3
(
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
)
CUBLAS_BLAS_ROUTINE_EACH_R3
(
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
)
#endif
#endif
// APIs available after CUDA 9.1
#if CUDA_VERSION >= 9010
#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4
(
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
}
// namespace dynload
}
// namespace dynload
}
// namespace platform
}
// namespace platform
...
...
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
7e4bd695
...
@@ -51,7 +51,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
...
@@ -51,7 +51,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
struct DynLoad__##__name { \
template <typename... Args> \
template <typename... Args> \
inline
cudnnStatus_t operator()(Args... args) {
\
inline
auto operator()(Args... args) {
\
return ::__name(args...); \
return ::__name(args...); \
} \
} \
}; \
}; \
...
...
paddle/fluid/platform/gpu_info.cc
浏览文件 @
7e4bd695
...
@@ -19,13 +19,32 @@ limitations under the License. */
...
@@ -19,13 +19,32 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_double
(
fraction_of_gpu_memory_to_use
,
0.92
,
#ifndef _WIN32
const
float
fraction_of_gpu_memory_to_use
=
0.92
f
;
#else
// fraction_of_gpu_memory_to_use cannot be too high on windows,
// since the win32 graphic sub-system can occupy some GPU memory
// which may lead to insufficient memory left for paddle
const
float
fraction_of_gpu_memory_to_use
=
0.5
f
;
#endif
DEFINE_double
(
fraction_of_gpu_memory_to_use
,
fraction_of_gpu_memory_to_use
,
"Allocate a trunk of gpu memory that is this fraction of the "
"Allocate a trunk of gpu memory that is this fraction of the "
"total gpu memory size. Future memory usage will be allocated "
"total gpu memory size. Future memory usage will be allocated "
"from the trunk. If the trunk doesn't have enough gpu memory, "
"from the trunk. If the trunk doesn't have enough gpu memory, "
"additional trunks of the same size will be requested from gpu "
"additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk."
);
"until the gpu has no memory left for another trunk."
);
DEFINE_bool
(
enable_cublas_tensor_op_math
,
false
,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
"but it may loss precision. Currently, There are two CUDA libraries that"
" use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
" GEMM computations(the matrices must be either half precision or single "
"precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
"input and output must be half precision) and recurrent neural networks "
"(RNNs)."
);
namespace
paddle
{
namespace
paddle
{
namespace
platform
{
namespace
platform
{
...
@@ -64,6 +83,16 @@ int GetCUDADriverVersion(int id) {
...
@@ -64,6 +83,16 @@ int GetCUDADriverVersion(int id) {
return
driver_version
;
return
driver_version
;
}
}
// Reports whether the current device can use Tensor Cores.
//
// When built with CUDA_VERSION >= 9000 this queries the compute capability
// of the current device and returns true if it is at least 70 (presumably
// compute capability 7.0 — confirm against GetCUDAComputeCapability's
// encoding). With older CUDA versions it always returns false.
bool TensorCoreAvailable() {
#if CUDA_VERSION >= 9000
  const int current_device = GetCurrentDeviceId();
  const int compute_capability = GetCUDAComputeCapability(current_device);
  return compute_capability >= 70;
#else
  return false;
#endif
}
int
GetCUDAMultiProcessors
(
int
id
)
{
int
GetCUDAMultiProcessors
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetCUDADeviceCount
(),
"id must less than GPU count"
);
PADDLE_ENFORCE_LT
(
id
,
GetCUDADeviceCount
(),
"id must less than GPU count"
);
int
count
;
int
count
;
...
...
paddle/fluid/platform/gpu_info.h
浏览文件 @
7e4bd695
...
@@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id);
...
@@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id);
//! Get the driver version of the ith GPU
//! Get the driver version of the ith GPU
int
GetCUDADriverVersion
(
int
id
);
int
GetCUDADriverVersion
(
int
id
);
//! Whether the current device supports TensorCore
bool
TensorCoreAvailable
();
//! Get the MultiProcessors of the ith GPU.
//! Get the MultiProcessors of the ith GPU.
int
GetCUDAMultiProcessors
(
int
i
);
int
GetCUDAMultiProcessors
(
int
i
);
...
...
paddle/fluid/platform/stream_callback_manager.h
浏览文件 @
7e4bd695
...
@@ -14,11 +14,11 @@
...
@@ -14,11 +14,11 @@
#pragma once
#pragma once
#include <ThreadPool.h>
#include <cuda.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime.h>
#include <functional>
#include <functional>
#include <memory>
#include <memory>
#include "ThreadPool.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
paddle
{
...
...
paddle/fluid/pybind/CMakeLists.txt
浏览文件 @
7e4bd695
...
@@ -5,8 +5,8 @@ if(WITH_PYTHON)
...
@@ -5,8 +5,8 @@ if(WITH_PYTHON)
if
(
WITH_AMD_GPU
)
if
(
WITH_AMD_GPU
)
hip_library
(
paddle_pybind SHARED
hip_library
(
paddle_pybind SHARED
SRCS
${
PYBIND_SRCS
}
SRCS
${
PYBIND_SRCS
}
DEPS
${
PYBIND_DEPS
}
DEPS
ARCHIVE_START
${
PYBIND_DEPS
}
${
GLOB_OP_LIB
}
${
GLOB_OPERATOR_DEPS
}
)
${
GLOB_OP_LIB
}
${
GLOB_OPERATOR_DEPS
}
ARCHIVE_END
)
else
()
else
()
cc_library
(
paddle_pybind SHARED
cc_library
(
paddle_pybind SHARED
SRCS
${
PYBIND_SRCS
}
SRCS
${
PYBIND_SRCS
}
...
...
paddle/fluid/pybind/protobuf.cc
浏览文件 @
7e4bd695
...
@@ -30,11 +30,12 @@ namespace pybind11 {
...
@@ -30,11 +30,12 @@ namespace pybind11 {
namespace
detail
{
namespace
detail
{
// Can be replaced by a generic lambda in C++14
// Can be replaced by a generic lambda in C++14
struct
variant_caster_visitor
:
public
boost
::
static_visitor
<
handle
>
{
struct
__attribute__
((
visibility
(
"hidden"
)))
paddle_variant_caster_visitor
:
public
boost
::
static_visitor
<
handle
>
{
return_value_policy
policy
;
return_value_policy
policy
;
handle
parent
;
handle
parent
;
variant_caster_visitor
(
return_value_policy
policy
,
handle
parent
)
paddle_
variant_caster_visitor
(
return_value_policy
policy
,
handle
parent
)
:
policy
(
policy
),
parent
(
parent
)
{}
:
policy
(
policy
),
parent
(
parent
)
{}
template
<
class
T
>
template
<
class
T
>
...
@@ -44,10 +45,10 @@ struct variant_caster_visitor : public boost::static_visitor<handle> {
...
@@ -44,10 +45,10 @@ struct variant_caster_visitor : public boost::static_visitor<handle> {
};
};
template
<
class
Variant
>
template
<
class
Variant
>
struct
variant_caster
;
struct
paddle_
variant_caster
;
template
<
template
<
class
...
>
class
V
,
class
...
Ts
>
template
<
template
<
class
...
>
class
V
,
class
...
Ts
>
struct
variant_caster
<
V
<
Ts
...
>>
{
struct
paddle_
variant_caster
<
V
<
Ts
...
>>
{
using
Type
=
V
<
Ts
...
>
;
using
Type
=
V
<
Ts
...
>
;
template
<
typename
T
>
template
<
typename
T
>
...
@@ -90,7 +91,7 @@ struct variant_caster<V<Ts...>> {
...
@@ -90,7 +91,7 @@ struct variant_caster<V<Ts...>> {
static
handle
cast
(
Type
const
&
src
,
return_value_policy
policy
,
static
handle
cast
(
Type
const
&
src
,
return_value_policy
policy
,
handle
parent
)
{
handle
parent
)
{
variant_caster_visitor
visitor
(
policy
,
parent
);
paddle_
variant_caster_visitor
visitor
(
policy
,
parent
);
return
boost
::
apply_visitor
(
visitor
,
src
);
return
boost
::
apply_visitor
(
visitor
,
src
);
}
}
...
@@ -101,7 +102,7 @@ struct variant_caster<V<Ts...>> {
...
@@ -101,7 +102,7 @@ struct variant_caster<V<Ts...>> {
// Add specialization for concrete variant type
// Add specialization for concrete variant type
template
<
class
...
Args
>
template
<
class
...
Args
>
struct
type_caster
<
boost
::
variant
<
Args
...
>>
struct
type_caster
<
boost
::
variant
<
Args
...
>>
:
variant_caster
<
boost
::
variant
<
Args
...
>>
{};
:
paddle_
variant_caster
<
boost
::
variant
<
Args
...
>>
{};
}
// namespace detail
}
// namespace detail
}
// namespace pybind11
}
// namespace pybind11
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
7e4bd695
...
@@ -86,12 +86,12 @@ bool IsCompiledWithDIST() {
...
@@ -86,12 +86,12 @@ bool IsCompiledWithDIST() {
#endif
#endif
}
}
PYBIND11_
PLUGIN
(
core
)
{
PYBIND11_
MODULE
(
core
,
m
)
{
// Not used, just make sure cpu_info.cc is linked.
// Not used, just make sure cpu_info.cc is linked.
paddle
::
platform
::
CpuTotalPhysicalMemory
();
paddle
::
platform
::
CpuTotalPhysicalMemory
();
paddle
::
memory
::
allocation
::
UseAllocatorStrategyGFlag
();
paddle
::
memory
::
allocation
::
UseAllocatorStrategyGFlag
();
py
::
module
m
(
"core"
,
"C++ core of PaddlePaddle"
)
;
m
.
doc
()
=
"C++ core of PaddlePaddle"
;
// using framework in this function. Since it is inside a function, it will
// using framework in this function. Since it is inside a function, it will
// not cause namespace pollution.
// not cause namespace pollution.
...
@@ -907,7 +907,6 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -907,7 +907,6 @@ All parameter, weight, gradient are variables in Paddle.
});
});
BindRecordIOWriter
(
&
m
);
BindRecordIOWriter
(
&
m
);
return
m
.
ptr
();
}
}
}
// namespace pybind
}
// namespace pybind
}
// namespace paddle
}
// namespace paddle
paddle/fluid/pybind/tensor_py.h
浏览文件 @
7e4bd695
...
@@ -21,7 +21,6 @@ limitations under the License. */
...
@@ -21,7 +21,6 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/float16.h"
#include "pybind11/common.h"
#include "pybind11/numpy.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/pybind11.h"
...
...
paddle/legacy/cuda/include/hl_warpctc_wrap.h
浏览文件 @
7e4bd695
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#ifndef _WIN32
#ifndef HL_WARPCTC_WRAP_H_
#ifndef HL_WARPCTC_WRAP_H_
#define HL_WARPCTC_WRAP_H_
#define HL_WARPCTC_WRAP_H_
#include "ctc.h"
#include "ctc.h"
#include "hl_base.h"
#include "hl_base.h"
...
@@ -91,3 +91,4 @@ extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
...
@@ -91,3 +91,4 @@ extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
size_t
*
bytes
);
size_t
*
bytes
);
#endif // HL_WARPCTC_WRAP_H_
#endif // HL_WARPCTC_WRAP_H_
#endif
paddle/legacy/cuda/src/hl_cuda_device.cc
浏览文件 @
7e4bd695
...
@@ -132,11 +132,15 @@ inline pid_t gettid() {
...
@@ -132,11 +132,15 @@ inline pid_t gettid() {
uint64_t
tid
;
uint64_t
tid
;
pthread_threadid_np
(
NULL
,
&
tid
);
pthread_threadid_np
(
NULL
,
&
tid
);
#else
#else
#ifndef _WIN32
#ifndef __NR_gettid
#ifndef __NR_gettid
#define __NR_gettid 224
#define __NR_gettid 224
#endif
#endif
pid_t
tid
=
syscall
(
__NR_gettid
);
pid_t
tid
=
syscall
(
__NR_gettid
);
#endif
#endif
#else // _WIN32
pid_t
tid
=
_getpid
();
#endif // _WIN32
CHECK_NE
((
int
)
tid
,
-
1
);
CHECK_NE
((
int
)
tid
,
-
1
);
return
tid
;
return
tid
;
}
}
...
...
paddle/legacy/utils/ThreadLocal.h
浏览文件 @
7e4bd695
...
@@ -14,10 +14,12 @@ limitations under the License. */
...
@@ -14,10 +14,12 @@ limitations under the License. */
#pragma once
#pragma once
#ifndef _WIN32
#include <pthread.h>
#include <pthread.h>
#include <sys/syscall.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <unistd.h>
#endif
#include <sys/types.h>
#include <map>
#include <map>
#include <mutex>
#include <mutex>
#include <random>
#include <random>
...
...
paddle/legacy/utils/Util.h
浏览文件 @
7e4bd695
...
@@ -14,7 +14,9 @@ limitations under the License. */
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#pragma once
#ifndef _WIN32
#include <sys/syscall.h> // for syscall()
#include <sys/syscall.h> // for syscall()
#endif
#include <sys/types.h>
#include <sys/types.h>
#include <algorithm>
#include <algorithm>
#include <cmath>
#include <cmath>
...
@@ -40,6 +42,31 @@ inline int rand_r(unsigned int* seedp) {
...
@@ -40,6 +42,31 @@ inline int rand_r(unsigned int* seedp) {
}
}
#endif
#endif
#ifdef _WIN32
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h>
template
<
typename
T
>
inline
int
__builtin_clz
(
const
T
&
value
)
{
DWORD
leadning_zero
=
0
;
if
(
_BitScanReverse
(
&
leadning_zero
,
value
))
{
return
static_cast
<
int
>
(
sizeof
(
T
)
*
8
-
leadning_zero
);
}
else
{
return
static_cast
<
int
>
(
0
);
}
}
inline
int
__builtin_clzl
(
const
unsigned
long
&
value
)
{
return
__builtin_clz
(
value
);
}
inline
int
__builtin_clzll
(
const
unsigned
long
long
&
value
)
{
return
__builtin_clz
(
value
);
}
#define pid_t int
#endif
/**
/**
* Loop over the elements in a container
* Loop over the elements in a container
* TODO(yuyang18): Is this foreach useful? Why not use C++ 11 foreach,
* TODO(yuyang18): Is this foreach useful? Why not use C++ 11 foreach,
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
7e4bd695
...
@@ -94,6 +94,30 @@ function cmake_gen() {
...
@@ -94,6 +94,30 @@ function cmake_gen() {
else
else
exit
1
exit
1
fi
fi
elif
[
"
$1
"
==
"cp36-cp36m"
]
;
then
if
[
-d
"/Library/Frameworks/Python.framework/Versions/3.6"
]
;
then
export
LD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.6/lib/
export
DYLD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.6/lib/
export
PATH
=
/Library/Frameworks/Python.framework/Versions/3.6/bin/:
${
PATH
}
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
WITH_FLUID_ONLY
=
${
WITH_FLUID_ONLY
:-
ON
}
else
exit
1
fi
elif
[
"
$1
"
==
"cp37-cp37m"
]
;
then
if
[
-d
"/Library/Frameworks/Python.framework/Versions/3.7"
]
;
then
export
LD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.7/lib/
export
DYLD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.7/lib/
export
PATH
=
/Library/Frameworks/Python.framework/Versions/3.7/bin/:
${
PATH
}
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
WITH_FLUID_ONLY
=
${
WITH_FLUID_ONLY
:-
ON
}
else
exit
1
fi
fi
fi
else
else
if
[
"
$1
"
!=
""
]
;
then
if
[
"
$1
"
!=
""
]
;
then
...
@@ -116,6 +140,18 @@ function cmake_gen() {
...
@@ -116,6 +140,18 @@ function cmake_gen() {
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
elif
[
"
$1
"
==
"cp36-cp36m"
]
;
then
export
LD_LIBRARY_PATH
=
/opt/_internal/cpython-3.6.0/lib/:
${
LD_LIBRARY_PATH
}
export
PATH
=
/opt/_internal/cpython-3.6.0/bin/:
${
PATH
}
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
elif
[
"
$1
"
==
"cp37-cp37m"
]
;
then
export
LD_LIBRARY_PATH
=
/opt/_internal/cpython-3.7.0/lib/:
${
LD_LIBRARY_PATH
}
export
PATH
=
/opt/_internal/cpython-3.7.0/bin/:
${
PATH
}
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
fi
fi
fi
fi
fi
fi
...
@@ -419,7 +455,7 @@ function assert_api_not_changed() {
...
@@ -419,7 +455,7 @@ function assert_api_not_changed() {
source
.env/bin/activate
source
.env/bin/activate
pip
install
${
PADDLE_ROOT
}
/build/python/dist/
*
whl
pip
install
${
PADDLE_ROOT
}
/build/python/dist/
*
whl
python
${
PADDLE_ROOT
}
/tools/print_signatures.py paddle.fluid
>
new.spec
python
${
PADDLE_ROOT
}
/tools/print_signatures.py paddle.fluid
>
new.spec
if
[
"
$1
"
==
"cp35-cp35m"
]
;
then
if
[
"
$1
"
==
"cp35-cp35m"
]
||
[
"
$1
"
==
"cp36-cp36m"
]
||
[
"
$1
"
==
"cp37-cp37m"
]
;
then
# Use sed to keep the python2 and python3 specs the same
# Use sed to keep the python2 and python3 specs the same
sed
-i
's/arg0: str/arg0: unicode/g'
new.spec
sed
-i
's/arg0: str/arg0: unicode/g'
new.spec
sed
-i
"s/
\(
.*Transpiler.*
\)
.__init__ ArgSpec(args=
\[
'self'].*/
\1
.__init__ /g"
new.spec
sed
-i
"s/
\(
.*Transpiler.*
\)
.__init__ ArgSpec(args=
\[
'self'].*/
\1
.__init__ /g"
new.spec
...
@@ -635,6 +671,55 @@ EOF
...
@@ -635,6 +671,55 @@ EOF
${
DOCKERFILE_CUBLAS_DSO
}
${
DOCKERFILE_CUBLAS_DSO
}
${
DOCKERFILE_GPU_ENV
}
${
DOCKERFILE_GPU_ENV
}
ENV NCCL_LAUNCH_MODE PARALLEL
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
elif
[
"
$1
"
==
"cp36-cp36m"
]
;
then
cat
>>
${
PADDLE_ROOT
}
/build/Dockerfile
<<
EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&
${
NCCL_DEPS
}
RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev
\
libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev
\
xz-utils tk-dev libffi-dev liblzma-dev
RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz &&
\
tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 &&
\
./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz &&
\
wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz &&
\
tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 &&
\
CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null &&
\
make -j8 > /dev/null && make altinstall > /dev/null
RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk &&
\
pip3.6 install opencv-python && pip3.6 install /*.whl; apt-get install -f -y &&
\
apt-get clean -y &&
\
rm -f /*.whl &&
\
${
PADDLE_VERSION
}
&&
\
ldconfig
${
DOCKERFILE_CUDNN_DSO
}
${
DOCKERFILE_CUBLAS_DSO
}
${
DOCKERFILE_GPU_ENV
}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
elif
[
"
$1
"
==
"cp37-cp37m"
]
;
then
cat
>>
${
PADDLE_ROOT
}
/build/Dockerfile
<<
EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&
${
NCCL_DEPS
}
RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev
\
libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev
\
xz-utils tk-dev libffi-dev liblzma-dev
RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz &&
\
tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 &&
\
CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null &&
\
make -j8 > /dev/null && make altinstall > /dev/null
RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk &&
\
pip3.7 install opencv-python && pip3.7 install /*.whl; apt-get install -f -y &&
\
apt-get clean -y &&
\
rm -f /*.whl &&
\
${
PADDLE_VERSION
}
&&
\
ldconfig
${
DOCKERFILE_CUDNN_DSO
}
${
DOCKERFILE_CUBLAS_DSO
}
${
DOCKERFILE_GPU_ENV
}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
EOF
else
else
cat
>>
${
PADDLE_ROOT
}
/build/Dockerfile
<<
EOF
cat
>>
${
PADDLE_ROOT
}
/build/Dockerfile
<<
EOF
...
...
paddle/testing/CMakeLists.txt
浏览文件 @
7e4bd695
...
@@ -3,8 +3,10 @@
...
@@ -3,8 +3,10 @@
if
(
WITH_TESTING
)
if
(
WITH_TESTING
)
add_library
(
paddle_test_main STATIC TestMain.cpp
)
add_library
(
paddle_test_main STATIC TestMain.cpp
)
add_dependencies
(
paddle_test_main paddle_proto
${
external_project_dependencies
}
)
add_dependencies
(
paddle_test_main paddle_proto
${
external_project_dependencies
}
)
if
(
NOT WIN32
)
add_library
(
paddle_test_util STATIC TestUtil.cpp
)
add_library
(
paddle_test_util STATIC TestUtil.cpp
)
add_dependencies
(
paddle_test_util paddle_proto
${
external_project_dependencies
}
)
add_dependencies
(
paddle_test_util paddle_proto
${
external_project_dependencies
}
)
endif
(
NOT WIN32
)
if
(
NOT MOBILE_INFERENCE
)
if
(
NOT MOBILE_INFERENCE
)
cc_library
(
paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags
)
cc_library
(
paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags
)
endif
()
endif
()
...
...
paddle/testing/paddle_gtest_main.cc
浏览文件 @
7e4bd695
...
@@ -28,9 +28,14 @@ int main(int argc, char** argv) {
...
@@ -28,9 +28,14 @@ int main(int argc, char** argv) {
for
(
int
i
=
0
;
i
<
argc
;
++
i
)
{
for
(
int
i
=
0
;
i
<
argc
;
++
i
)
{
new_argv
.
push_back
(
argv
[
i
]);
new_argv
.
push_back
(
argv
[
i
]);
}
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
new_argv
.
push_back
(
new_argv
.
push_back
(
strdup
(
"--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"
));
strdup
(
"--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"
));
#elif __clang__
new_argv
.
push_back
(
strdup
(
"--tryfromenv=use_mkldnn,initial_cpu_memory_in_"
"mb,allocator_strategy"
));
new_argv
.
push_back
(
strdup
(
"--undefok=use_mkldnn,initial_cpu_memory_in_mb"
));
#else
#else
new_argv
.
push_back
(
new_argv
.
push_back
(
strdup
(
"--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"
strdup
(
"--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"
...
...
python/paddle/fluid/__init__.py
浏览文件 @
7e4bd695
...
@@ -91,6 +91,7 @@ def __bootstrap__():
...
@@ -91,6 +91,7 @@ def __bootstrap__():
"""
"""
import
sys
import
sys
import
os
import
os
import
platform
from
.
import
core
from
.
import
core
in_test
=
'unittest'
in
sys
.
modules
in_test
=
'unittest'
in
sys
.
modules
...
@@ -110,14 +111,17 @@ def __bootstrap__():
...
@@ -110,14 +111,17 @@ def __bootstrap__():
print
(
'PLEASE USE OMP_NUM_THREADS WISELY.'
,
file
=
sys
.
stderr
)
print
(
'PLEASE USE OMP_NUM_THREADS WISELY.'
,
file
=
sys
.
stderr
)
os
.
environ
[
'OMP_NUM_THREADS'
]
=
str
(
num_threads
)
os
.
environ
[
'OMP_NUM_THREADS'
]
=
str
(
num_threads
)
sysstr
=
platform
.
system
()
read_env_flags
=
[
read_env_flags
=
[
'
use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'eager_delete_scope
'
,
'
check_nan_inf'
,
'benchmark'
,
'eager_delete_scope'
,
'use_mkldnn
'
,
'use_
mkldnn'
,
'use_ngraph'
,
'initial_cpu_memory_in_mb
'
,
'use_
ngraph'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem
'
,
'
init_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads'
,
'
free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'allocator_strategy'
,
'eager_delete_tensor_gb'
,
'allocator_strategy'
,
'reader_queue_speed_test_mode'
,
'print_sub_graph_dir'
'reader_queue_speed_test_mode'
,
'print_sub_graph_dir'
]
]
if
'Darwin'
not
in
sysstr
:
read_env_flags
.
append
(
'use_pinned_memory'
)
if
os
.
name
!=
'nt'
:
if
os
.
name
!=
'nt'
:
read_env_flags
.
append
(
'warpctc_dir'
)
read_env_flags
.
append
(
'warpctc_dir'
)
read_env_flags
.
append
(
'cpu_deterministic'
)
read_env_flags
.
append
(
'cpu_deterministic'
)
...
@@ -129,11 +133,13 @@ def __bootstrap__():
...
@@ -129,11 +133,13 @@ def __bootstrap__():
read_env_flags
.
append
(
'rpc_send_thread_num'
)
read_env_flags
.
append
(
'rpc_send_thread_num'
)
read_env_flags
.
append
(
'rpc_get_thread_num'
)
read_env_flags
.
append
(
'rpc_get_thread_num'
)
read_env_flags
.
append
(
'rpc_prefetch_thread_num'
)
read_env_flags
.
append
(
'rpc_prefetch_thread_num'
)
read_env_flags
.
append
(
'rpc_disable_reuse_port'
)
if
core
.
is_compiled_with_cuda
():
if
core
.
is_compiled_with_cuda
():
read_env_flags
+=
[
read_env_flags
+=
[
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
'enable_cublas_tensor_op_math'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
]
]
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
[
"--tryfromenv="
+
","
.
join
(
read_env_flags
)])
[
"--tryfromenv="
+
","
.
join
(
read_env_flags
)])
...
...
python/paddle/fluid/contrib/utils/__init__.py
浏览文件 @
7e4bd695
...
@@ -13,8 +13,10 @@
...
@@ -13,8 +13,10 @@
# limitations under the License.
# limitations under the License.
from
__future__
import
print_function
from
__future__
import
print_function
from
.
import
lookup_table_utils
from
.lookup_table_utils
import
*
from
.
import
hdfs_utils
from
.
import
hdfs_utils
from
.hdfs_utils
import
*
from
.hdfs_utils
import
*
__all__
=
lookup_table_utils
.
__all__
__all__
=
hdfs_utils
.
__all__
__all__
=
hdfs_utils
.
__all__
python/paddle/fluid/contrib/utils/lookup_table_utils.py
0 → 100644
浏览文件 @
7e4bd695
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
os
import
time
import
logging
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
from
paddle.fluid
import
io
from
paddle.fluid
import
Program
__all__
=
[
"load_inference_model"
,
"load_persistable_vars"
,
"convert_dist_to_sparse_program"
]
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(message)s'
)
_logger
=
logging
.
getLogger
(
"lookup_table_utils"
)
_logger
.
setLevel
(
logging
.
INFO
)
model_filename
=
"__model__"
lookup_table_dir
=
"__lookup_table__"
def __insert_lookup_sparse_table_op(main_program, idx, ids, w, out):
    """Insert a ``lookup_sparse_table`` op into the program's global block.

    Args:
        main_program: program whose global block receives the new op.
        idx: position handed to ``_insert_op`` as ``index``.
        ids: variable wired to the op's ``Ids`` input.
        w: variable wired to the op's ``W`` input.
        out: variable wired to the op's ``Out`` output.
    """
    target_block = main_program.global_block()
    op_inputs = {"Ids": [ids], "W": [w]}
    op_outputs = {"Out": [out]}
    # The op is always inserted as a local (non-distributed) sparse lookup.
    op_attrs = {
        "is_distributed": False,
        "is_sparse": True,
        "grad_inplace": False,
    }
    target_block._insert_op(
        index=idx,
        type="lookup_sparse_table",
        inputs=op_inputs,
        outputs=op_outputs,
        attrs=op_attrs)
def __get_prefetch_op_tuples(main_program):
    """Find the first ``split_ids -> prefetch -> merge_ids`` op triple.

    Args:
        main_program: program whose global block ops are scanned in order.

    Returns:
        ``None`` when no such triple exists. Otherwise a tuple
        ``(split_ids_op_id, split_ids_inputs, merge_ids_outputs,
        need_delete_vars)`` where ``split_ids_op_id`` is the index of the
        ``split_ids`` op, ``split_ids_inputs`` is its ``Ids`` input,
        ``merge_ids_outputs`` is the ``Out`` output of the ``merge_ids`` op,
        and ``need_delete_vars`` is a list holding the prefetch op's ``X``
        inputs followed by its ``Out`` outputs.
    """
    # current lookup tables op is split_ids->prefetch->merge_ids
    ops = main_program.global_block().ops
    op_types = [op.type for op in ops]

    # A matching prefetch op needs a neighbour on both sides, so the first and
    # last positions are never candidates. Restricting the range also keeps
    # ``i - 1`` from wrapping around to the end of the list and ``i + 1`` from
    # running past it.
    for i in range(1, len(op_types) - 1):
        if op_types[i] != "prefetch":
            continue
        if op_types[i - 1] != "split_ids" or op_types[i + 1] != "merge_ids":
            continue

        split_ids_op_id = i - 1
        split_ids_inputs = ops[i - 1].input("Ids")
        prefetch_op_inputs = ops[i].input("X")
        prefetch_op_outputs = ops[i].output("Out")
        merge_ids_outputs = ops[i + 1].output("Out")

        need_delete_vars = []
        need_delete_vars.extend(prefetch_op_inputs)
        need_delete_vars.extend(prefetch_op_outputs)

        # Only the first matching triple is reported.
        return (split_ids_op_id, split_ids_inputs, merge_ids_outputs,
                need_delete_vars)
    return None
def convert_dist_to_sparse_program(main_program):
    """Rewrite a distributed-lookup-table program to use a local sparse table.

    The distributed lookup table variable is renamed to ``<name>.origin`` and
    a new persistable ``SELECTED_ROWS`` variable is created under the original
    name. The ``split_ids -> prefetch -> merge_ids`` op triple is then removed
    and replaced by one ``lookup_sparse_table`` op per (ids, out) pair.

    Args:
        main_program: program to convert; it is modified in place.

    Returns:
        ``main_program`` after conversion, or ``None`` when the program has no
        distributed lookup table (a warning is logged in that case).

    Raises:
        ValueError: if the program has a distributed lookup table but no
            ``split_ids -> prefetch -> merge_ids`` op sequence.
    """
    if not main_program._distributed_lookup_table:
        _logger.warning(
            "There are no distributed lookup tables need to be converted")
        return

    # create table param and grad var in pserver program
    origin_emb_var = "{}.origin".format(main_program._distributed_lookup_table)
    emb_var = main_program._distributed_lookup_table
    main_program.global_block()._rename_var(emb_var, origin_emb_var)
    origin_param_var = main_program.global_block().vars[origin_emb_var]

    param_var = main_program.global_block().create_var(
        name=emb_var,
        shape=origin_param_var.shape,
        dtype=origin_param_var.dtype,
        type=core.VarDesc.VarType.SELECTED_ROWS,
        persistable=True)
    # parameter must be selected rows
    param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
    main_program._sync_with_cpp()

    prefetch_op_tuples = __get_prefetch_op_tuples(main_program)
    if prefetch_op_tuples is None:
        # Fail with a clear message instead of subscripting None below.
        raise ValueError(
            "No split_ids->prefetch->merge_ids op sequence found for "
            "distributed lookup table '%s'" % emb_var)

    split_ids_id = prefetch_op_tuples[0]

    # Remove merge_ids, prefetch and split_ids, highest index first so the
    # remaining indices stay valid while removing.
    for idx in range(split_ids_id + 2, split_ids_id - 1, -1):
        main_program.global_block()._remove_op(idx)
    main_program.desc.flush()

    in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2])

    # Every replacement op is inserted at the slot the split_ids op occupied.
    for in_out_pair in in_out_pairs:
        idx = split_ids_id
        ids = main_program.global_block().vars[in_out_pair[0]]
        out = main_program.global_block().vars[in_out_pair[1]]
        __insert_lookup_sparse_table_op(main_program, idx, ids, param_var, out)
        main_program.desc.flush()
    return main_program
def load_persistable_vars(executor, dirname, program, lookup_table_var):
    """Load persistable variables plus the sharded lookup table from ``dirname``.

    Regular persistable variables are loaded through ``io.load_vars`` with the
    lookup table excluded; the lookup table itself is then rebuilt from the
    shard files found under ``<dirname>/__lookup_table__``.

    Args:
        executor: executor used to run the load programs.
        dirname: directory holding the saved variables.
        program: program whose variables are loaded.
        lookup_table_var: name of the lookup table variable.

    Raises:
        ValueError: if ``dirname`` is not an existing directory (raised while
            loading the lookup table).
    """

    def _is_checkpoint_var(exclude_fluid_vars=None):
        """Build the predicate that selects variables to load.

        Variables of type FEED_MINIBATCH/FETCH_LIST/RAW are rejected, as are
        variables whose name contains ``@GRAD``, ``.trainer_``, ``.block`` or
        ``tmp_``, and variables named in ``exclude_fluid_vars``. Anything else
        is accepted when it is persistable.

        Args:
            exclude_fluid_vars: names to reject; ``None`` means no extra names.

        Returns:
            A function taking a variable and returning whether to load it.
        """
        if exclude_fluid_vars is None:
            exclude_fluid_vars = []

        def is_valid(var):
            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
                    var.desc.type() == core.VarDesc.VarType.RAW:
                return False
            # @GRAD are named for gradient variables, checkpoint will not save it.
            if "@GRAD" in var.name:
                return False
            # .trainer_ are named for distribute train variables, checkpoint will not save it.
            if ".trainer_" in var.name:
                return False
            # .block is named for distribute train variables, checkpoint will not save it.
            if ".block" in var.name:
                return False
            if "tmp_" in var.name:
                return False
            if var.name in exclude_fluid_vars:
                return False
            return var.persistable

        return is_valid

    def _load_lookup_table_vars(executor, dirname, main_program,
                                lookup_table_vars):
        """Load every lookup table shard and sum the shards into one variable.

        Only the first entry of ``lookup_table_vars`` is used.
        """
        if not os.path.isdir(dirname):
            # Format the message; passing dirname as a second argument would
            # leave the literal '%s' in the error text.
            raise ValueError("There is no directory named '%s'" % dirname)

        lookup_table_dirname = os.path.join(dirname, lookup_table_dir)

        emb_var_name = lookup_table_vars[0]
        emb_var = main_program.global_block().var(emb_var_name)

        # Every file whose name contains the table name counts as one shard.
        emb_files = []
        for emb_name in os.listdir(lookup_table_dirname):
            if emb_var_name in emb_name:
                emb_files.append(emb_name)

        convert_program = Program()
        global_block = convert_program.global_block()

        emb_var = global_block.create_var(
            name=emb_var.name,
            shape=emb_var.shape,
            dtype=emb_var.dtype,
            type=core.VarDesc.VarType.SELECTED_ROWS,
            persistable=True)
        emb_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)

        sums = []

        for i, emb_file in enumerate(emb_files):
            # NOTE(review): the shard is read from "<table>_<i>", not from
            # ``emb_file``; presumably the saved shard files follow exactly
            # that naming - confirm against the code that writes them.
            var_name = "{}_{}".format(emb_var.name, i)
            param_var = global_block.create_var(
                name=var_name,
                shape=emb_var.shape,
                dtype=emb_var.dtype,
                type=core.VarDesc.VarType.SELECTED_ROWS,
                persistable=True)
            param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
            global_block.append_op(
                type='load',
                inputs={},
                outputs={'Out': [param_var]},
                attrs={
                    'file_path': os.path.join(lookup_table_dirname, var_name)
                })
            sums.append(param_var)
        # Merge the shards into the table variable, then drop the temporaries.
        global_block.append_op(
            type='sum', inputs={"X": sums}, outputs={'Out': emb_var}, attrs={})
        global_block.append_op(type='delete_var', inputs={'X': sums})
        executor.run(convert_program)

    _logger.info("Start Load Sparse Program With "
                 "Distributed Lookup Table Vars from {}, time = {}".format(
                     dirname, time.ctime()))

    lookup_table_vars = [lookup_table_var]

    # The lookup table is excluded here and loaded separately below.
    io.load_vars(
        executor,
        dirname=dirname,
        main_program=program,
        predicate=_is_checkpoint_var(lookup_table_vars),
        filename=None)

    _load_lookup_table_vars(executor, dirname, program, lookup_table_vars)

    _logger.info("Finish Load Sparse Program With "
                 "Distributed Lookup Table Vars from {}, time = {}".format(
                     dirname, time.ctime()))
def load_inference_model(dirname, executor, lookup_table_var_name):
    """Load an inference program and its variables, including the lookup table.

    The serialized program is read from ``<dirname>/__model__`` and its
    persistable variables are loaded with ``load_persistable_vars``.

    Args:
        dirname: directory holding the saved model.
        executor: executor used to run the load programs.
        lookup_table_var_name: name of the lookup table variable.

    Returns:
        A list ``[program, feed_target_names, fetch_targets]`` where
        ``fetch_targets`` are the global-block variables named by the
        program's fetch target names.

    Raises:
        ValueError: if ``dirname`` is not an existing directory, or if the
            program version is not supported.
    """
    if not os.path.isdir(dirname):
        # Format the message; passing dirname as a second argument would leave
        # the literal '%s' in the error text.
        raise ValueError("There is no directory named '%s'" % dirname)

    local_model = os.path.join(dirname, model_filename)

    with open(local_model, "rb") as f:
        program_desc_str = f.read()

    program = Program.parse_from_string(program_desc_str)

    if not core._is_program_version_supported(program._version()):
        raise ValueError("Unsupported program version: %d\n" %
                         program._version())

    # Binary data also need version.
    load_persistable_vars(executor, dirname, program, lookup_table_var_name)

    feed_target_names = program.desc.get_feed_target_names()
    fetch_target_names = program.desc.get_fetch_target_names()
    fetch_targets = [
        program.global_block().var(name) for name in fetch_target_names
    ]

    return [program, feed_target_names, fetch_targets]
python/paddle/fluid/framework.py
浏览文件 @
7e4bd695
...
@@ -1698,6 +1698,7 @@ class Program(object):
...
@@ -1698,6 +1698,7 @@ class Program(object):
p
.
_copy_param_info_from
(
self
)
p
.
_copy_param_info_from
(
self
)
p
.
_copy_data_info_from
(
self
)
p
.
_copy_data_info_from
(
self
)
p
.
_copy_dist_param_info_from
(
self
)
return
p
return
p
def
_prune
(
self
,
targets
):
def
_prune
(
self
,
targets
):
...
@@ -1938,6 +1939,25 @@ class Program(object):
...
@@ -1938,6 +1939,25 @@ class Program(object):
"program, with represent the same topology"
)
"program, with represent the same topology"
)
self
.
global_block
().
_copy_param_info_from
(
other
.
global_block
())
self
.
global_block
().
_copy_param_info_from
(
other
.
global_block
())
def
_copy_dist_param_info_from
(
self
,
other
):
"""
Copy the information of distributed information from other program.
Args:
other(Program): Other program
Returns:
None
"""
if
not
isinstance
(
other
,
Program
):
raise
TypeError
(
"_copy_dist_param_info_from should be invoked with "
"Program"
)
self
.
_is_distributed
=
other
.
_is_distributed
self
.
_is_chief
=
other
.
_is_chief
self
.
_slice_vars_and_attrs
=
other
.
_slice_vars_and_attrs
self
.
_endpoints
=
other
.
_endpoints
self
.
_distributed_lookup_table
=
other
.
_distributed_lookup_table
def
_copy_data_info_from
(
self
,
other
):
def
_copy_data_info_from
(
self
,
other
):
"""
"""
Copy the information of data variables from other program.
Copy the information of data variables from other program.
...
...
python/paddle/fluid/io.py
浏览文件 @
7e4bd695
...
@@ -165,6 +165,7 @@ def save_vars(executor,
...
@@ -165,6 +165,7 @@ def save_vars(executor,
save_vars
(
save_vars
(
executor
,
executor
,
main_program
=
main_program
,
dirname
=
dirname
,
dirname
=
dirname
,
vars
=
list
(
filter
(
predicate
,
main_program
.
list_vars
())),
vars
=
list
(
filter
(
predicate
,
main_program
.
list_vars
())),
filename
=
filename
)
filename
=
filename
)
...
@@ -172,11 +173,18 @@ def save_vars(executor,
...
@@ -172,11 +173,18 @@ def save_vars(executor,
save_program
=
Program
()
save_program
=
Program
()
save_block
=
save_program
.
global_block
()
save_block
=
save_program
.
global_block
()
if
main_program
is
None
:
main_program
=
default_main_program
()
if
not
isinstance
(
main_program
,
Program
):
raise
TypeError
(
"program should be as Program type or None"
)
save_var_map
=
{}
save_var_map
=
{}
for
each_var
in
vars
:
for
each_var
in
vars
:
# NOTE: don't save the variable which type is RAW
# NOTE: don't save the variable which type is RAW
if
each_var
.
type
==
core
.
VarDesc
.
VarType
.
RAW
:
if
each_var
.
type
==
core
.
VarDesc
.
VarType
.
RAW
:
continue
continue
if
each_var
.
name
==
main_program
.
_distributed_lookup_table
:
continue
new_var
=
_clone_var_in_block_
(
save_block
,
each_var
)
new_var
=
_clone_var_in_block_
(
save_block
,
each_var
)
if
filename
is
None
:
if
filename
is
None
:
save_block
.
append_op
(
save_block
.
append_op
(
...
@@ -198,6 +206,16 @@ def save_vars(executor,
...
@@ -198,6 +206,16 @@ def save_vars(executor,
outputs
=
{},
outputs
=
{},
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
filename
)})
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
filename
)})
# if there is lookup table, the trainer 0 will notify all pserver to save.
if
main_program
.
_is_distributed
and
main_program
.
_is_chief
and
main_program
.
_distributed_lookup_table
:
lookup_table_filename
=
os
.
path
.
join
(
dirname
,
"__lookup_table__"
)
attrs
=
{}
attrs
[
'epmap'
]
=
main_program
.
_endpoints
attrs
[
'dir'
]
=
lookup_table_filename
attrs
[
'lookup_table'
]
=
main_program
.
_distributed_lookup_table
save_block
.
append_op
(
type
=
'checkpoint_notify'
,
inputs
=
{},
outputs
=
{},
attrs
=
attrs
)
executor
.
run
(
save_program
)
executor
.
run
(
save_program
)
...
@@ -379,11 +397,22 @@ def load_vars(executor,
...
@@ -379,11 +397,22 @@ def load_vars(executor,
load_prog
=
Program
()
load_prog
=
Program
()
load_block
=
load_prog
.
global_block
()
load_block
=
load_prog
.
global_block
()
if
main_program
is
None
:
main_program
=
default_main_program
()
if
not
isinstance
(
main_program
,
Program
):
raise
TypeError
(
"program should be as Program type or None"
)
load_slice_vars
=
[]
for
each_var
in
main_program
.
_slice_vars_and_attrs
:
load_slice_vars
.
append
(
each_var
[
2
].
name
)
load_var_map
=
{}
load_var_map
=
{}
for
each_var
in
vars
:
for
each_var
in
vars
:
assert
isinstance
(
each_var
,
Variable
)
assert
isinstance
(
each_var
,
Variable
)
if
each_var
.
type
==
core
.
VarDesc
.
VarType
.
RAW
:
if
each_var
.
type
==
core
.
VarDesc
.
VarType
.
RAW
:
continue
continue
if
each_var
.
name
in
load_slice_vars
:
continue
new_var
=
_clone_var_in_block_
(
load_block
,
each_var
)
new_var
=
_clone_var_in_block_
(
load_block
,
each_var
)
if
filename
is
None
:
if
filename
is
None
:
load_block
.
append_op
(
load_block
.
append_op
(
...
@@ -406,9 +435,6 @@ def load_vars(executor,
...
@@ -406,9 +435,6 @@ def load_vars(executor,
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
filename
)})
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
filename
)})
executor
.
run
(
load_prog
)
executor
.
run
(
load_prog
)
if
main_program
is
None
:
main_program
=
default_main_program
()
# load slice vars on pserver, if have it.
# load slice vars on pserver, if have it.
_load_slice_up_vars
(
executor
,
dirname
,
_load_slice_up_vars
(
executor
,
dirname
,
main_program
.
_slice_vars_and_attrs
)
main_program
.
_slice_vars_and_attrs
)
...
@@ -618,13 +644,6 @@ def save_inference_model(dirname,
...
@@ -618,13 +644,6 @@ def save_inference_model(dirname,
if
main_program
is
None
:
if
main_program
is
None
:
main_program
=
default_main_program
()
main_program
=
default_main_program
()
# if there is lookup table, the trainer 0 will notify all pserver to save.
if
main_program
.
_is_distributed
and
main_program
.
_is_chief
and
main_program
.
_distributed_lookup_table
:
lookup_table_filename
=
os
.
path
.
join
(
dirname
,
"__lookup_table__"
)
_save_lookup_tables_by_notify
(
executor
,
lookup_table_filename
,
main_program
.
_distributed_lookup_table
,
main_program
.
_endpoints
)
# when a pserver and a trainer running on the same machine, mkdir may conflict
# when a pserver and a trainer running on the same machine, mkdir may conflict
try
:
try
:
os
.
makedirs
(
dirname
)
os
.
makedirs
(
dirname
)
...
@@ -642,6 +661,9 @@ def save_inference_model(dirname,
...
@@ -642,6 +661,9 @@ def save_inference_model(dirname,
# it can only be loaded for inference directly. If it's false, the whole
# it can only be loaded for inference directly. If it's false, the whole
# original program and related meta are saved so that future usage can be
# original program and related meta are saved so that future usage can be
# more flexible.
# more flexible.
origin_program
=
main_program
.
clone
()
if
export_for_deployment
:
if
export_for_deployment
:
main_program
=
main_program
.
clone
()
main_program
=
main_program
.
clone
()
global_block
=
main_program
.
global_block
()
global_block
=
main_program
.
global_block
()
...
@@ -666,8 +688,11 @@ def save_inference_model(dirname,
...
@@ -666,8 +688,11 @@ def save_inference_model(dirname,
with
open
(
model_basename
+
".main_program"
,
"wb"
)
as
f
:
with
open
(
model_basename
+
".main_program"
,
"wb"
)
as
f
:
f
.
write
(
main_program
.
desc
.
serialize_to_string
())
f
.
write
(
main_program
.
desc
.
serialize_to_string
())
main_program
.
_copy_dist_param_info_from
(
origin_program
)
if
params_filename
is
not
None
:
if
params_filename
is
not
None
:
params_filename
=
os
.
path
.
basename
(
params_filename
)
params_filename
=
os
.
path
.
basename
(
params_filename
)
save_persistables
(
executor
,
dirname
,
main_program
,
params_filename
)
save_persistables
(
executor
,
dirname
,
main_program
,
params_filename
)
...
@@ -897,6 +922,9 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
...
@@ -897,6 +922,9 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
slice_var
=
var_tuple
[
2
]
slice_var
=
var_tuple
[
2
]
end
=
start
+
slice_var
.
shape
[
0
]
end
=
start
+
slice_var
.
shape
[
0
]
orig_var_name
=
orig_var
.
name
orig_var
.
name
=
"{}.origin"
.
format
(
orig_var_name
)
clone_orig_var
=
load_block
.
create_var
(
clone_orig_var
=
load_block
.
create_var
(
name
=
orig_var
.
name
,
name
=
orig_var
.
name
,
type
=
orig_var
.
type
,
type
=
orig_var
.
type
,
...
@@ -915,7 +943,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
...
@@ -915,7 +943,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
type
=
'load'
,
type
=
'load'
,
inputs
=
{},
inputs
=
{},
outputs
=
{
'Out'
:
[
clone_orig_var
]},
outputs
=
{
'Out'
:
[
clone_orig_var
]},
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
clone_orig_var
.
name
)})
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
orig_var_
name
)})
load_block
.
append_op
(
load_block
.
append_op
(
type
=
"slice"
,
type
=
"slice"
,
inputs
=
{
'Input'
:
clone_orig_var
},
inputs
=
{
'Input'
:
clone_orig_var
},
...
@@ -924,6 +952,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
...
@@ -924,6 +952,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
'starts'
:
[
start
],
'starts'
:
[
start
],
'ends'
:
[
end
]})
'ends'
:
[
end
]})
need_delete_vars
.
append
(
clone_orig_var
)
need_delete_vars
.
append
(
clone_orig_var
)
load_block
.
append_op
(
load_block
.
append_op
(
type
=
'delete_var'
,
type
=
'delete_var'
,
inputs
=
{
'X'
:
need_delete_vars
},
)
inputs
=
{
'X'
:
need_delete_vars
},
)
...
...
python/paddle/fluid/layers/control_flow.py
浏览文件 @
7e4bd695
...
@@ -896,9 +896,10 @@ def array_to_lod_tensor(x, table):
...
@@ -896,9 +896,10 @@ def array_to_lod_tensor(x, table):
def
increment
(
x
,
value
=
1.0
,
in_place
=
True
):
def
increment
(
x
,
value
=
1.0
,
in_place
=
True
):
"""
"""
This function performs an operation that increments
each
value in the
This function performs an operation that increments
the
value in the
input :math:`x` by an amount: :math:`value` as mentioned in the input
input :math:`x` by an amount: :math:`value` as mentioned in the input
parameter. This operation is performed in-place by default.
parameter. This operation is performed in-place by default. Notice that
the number of elements in :math:`x` must be equal to 1.
Args:
Args:
x (Variable|list): The tensor that has the input values.
x (Variable|list): The tensor that has the input values.
...
@@ -911,7 +912,8 @@ def increment(x, value=1.0, in_place=True):
...
@@ -911,7 +912,8 @@ def increment(x, value=1.0, in_place=True):
Examples:
Examples:
.. code-block:: python
.. code-block:: python
data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
data = fluid.layers.data(name='data', shape=[1], dtype='float32',
append_batch_size=False)
data = fluid.layers.increment(x=data, value=3.0, in_place=True)
data = fluid.layers.increment(x=data, value=3.0, in_place=True)
"""
"""
helper
=
LayerHelper
(
"increment"
,
**
locals
())
helper
=
LayerHelper
(
"increment"
,
**
locals
())
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
7e4bd695
...
@@ -1029,6 +1029,7 @@ def density_prior_box(input,
...
@@ -1029,6 +1029,7 @@ def density_prior_box(input,
clip
=
False
,
clip
=
False
,
steps
=
[
0.0
,
0.0
],
steps
=
[
0.0
,
0.0
],
offset
=
0.5
,
offset
=
0.5
,
flatten_to_2d
=
False
,
name
=
None
):
name
=
None
):
"""
"""
**Density Prior Box Operator**
**Density Prior Box Operator**
...
@@ -1065,22 +1066,24 @@ def density_prior_box(input,
...
@@ -1065,22 +1066,24 @@ def density_prior_box(input,
height/weight of the input will be automatically calculated.
height/weight of the input will be automatically calculated.
Default: [0., 0.]
Default: [0., 0.]
offset(float): Prior boxes center offset. Default: 0.5
offset(float): Prior boxes center offset. Default: 0.5
flatten_to_2d(bool): Whether to flatten output prior boxes and variance
to 2D shape, the second dim is 4. Default: False.
name(str): Name of the density prior box op. Default: None.
name(str): Name of the density prior box op. Default: None.
Returns:
Returns:
tuple: A tuple with two Variable (boxes, variances)
tuple: A tuple with two Variable (boxes, variances)
boxes: the output density prior boxes of PriorBox.
boxes: the output density prior boxes of PriorBox.
The layout is [H, W, num_priors, 4].
The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
H is the height of input, W is the width of input,
H is the height of input, W is the width of input,
num_priors is the total
num_priors is the total box count of each position of input.
box count of each position of input.
variances: the expanded variances of PriorBox.
variances: the expanded variances of PriorBox.
The layout is [H, W, num_priors, 4].
The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
H is the height of input, W is the width of input
H is the height of input, W is the width of input
num_priors is the total
num_priors is the total box count of each position of input.
box count of each position of input
Examples:
Examples:
...
@@ -1089,14 +1092,11 @@ def density_prior_box(input,
...
@@ -1089,14 +1092,11 @@ def density_prior_box(input,
box, var = fluid.layers.density_prior_box(
box, var = fluid.layers.density_prior_box(
input=conv1,
input=conv1,
image=images,
image=images,
min_sizes=[100.],
densities=[4, 2, 1],
max_sizes=[200.],
fixed_sizes=[32.0, 64.0, 128.0],
aspect_ratios=[1.0, 1.0 / 2.0, 2.0],
fixed_ratios=[1.],
densities=[3, 4],
clip=True,
fixed_sizes=[50., 60.],
flatten_to_2d=True)
fixed_ratios=[1.0, 3.0, 1.0 / 3.0],
flip=True,
clip=True)
"""
"""
helper
=
LayerHelper
(
"density_prior_box"
,
**
locals
())
helper
=
LayerHelper
(
"density_prior_box"
,
**
locals
())
dtype
=
helper
.
input_dtype
()
dtype
=
helper
.
input_dtype
()
...
@@ -1127,14 +1127,11 @@ def density_prior_box(input,
...
@@ -1127,14 +1127,11 @@ def density_prior_box(input,
'step_w'
:
steps
[
0
],
'step_w'
:
steps
[
0
],
'step_h'
:
steps
[
1
],
'step_h'
:
steps
[
1
],
'offset'
:
offset
,
'offset'
:
offset
,
'densities'
:
densities
,
'fixed_sizes'
:
fixed_sizes
,
'fixed_ratios'
:
fixed_ratios
,
'flatten_to_2d'
:
flatten_to_2d
,
}
}
if
densities
is
not
None
and
len
(
densities
)
>
0
:
attrs
[
'densities'
]
=
densities
if
fixed_sizes
is
not
None
and
len
(
fixed_sizes
)
>
0
:
attrs
[
'fixed_sizes'
]
=
fixed_sizes
if
fixed_ratios
is
not
None
and
len
(
fixed_ratios
)
>
0
:
attrs
[
'fixed_ratios'
]
=
fixed_ratios
box
=
helper
.
create_variable_for_type_inference
(
dtype
)
box
=
helper
.
create_variable_for_type_inference
(
dtype
)
var
=
helper
.
create_variable_for_type_inference
(
dtype
)
var
=
helper
.
create_variable_for_type_inference
(
dtype
)
helper
.
append_op
(
helper
.
append_op
(
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
7e4bd695
...
@@ -2134,11 +2134,16 @@ def pool2d(input,
...
@@ -2134,11 +2134,16 @@ def pool2d(input,
input tensor is NCHW, where N is batch size, C is
input tensor is NCHW, where N is batch size, C is
the number of channels, H is the height of the
the number of channels, H is the height of the
feature, and W is the width of the feature.
feature, and W is the width of the feature.
pool_size (int): The side length of pooling windows. All pooling
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
windows are squares with pool_size on a side.
it must contain two integers, (pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be a square of an int.
pool_type: ${pooling_type_comment}
pool_type: ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
pool_padding (int): padding size.
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int.
pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
Otherwise, the pool padding size will be a square of an int.
global_pooling (bool): ${global_pooling_comment}
global_pooling (bool): ${global_pooling_comment}
use_cudnn (bool): ${use_cudnn_comment}
use_cudnn (bool): ${use_cudnn_comment}
ceil_mode (bool): ${ceil_mode_comment}
ceil_mode (bool): ${ceil_mode_comment}
...
@@ -5916,9 +5921,10 @@ def image_resize(input,
...
@@ -5916,9 +5921,10 @@ def image_resize(input,
raise
ValueError
(
raise
ValueError
(
"The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
"The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
)
)
resample_type
=
resample_methods
[
resample
]
if
out_shape
is
None
and
scale
is
None
:
if
out_shape
is
None
and
scale
is
None
:
raise
ValueError
(
"One of out_shape and scale must not be None."
)
raise
ValueError
(
"One of out_shape and scale must not be None."
)
helper
=
LayerHelper
(
'
interpolate'
,
**
locals
())
helper
=
LayerHelper
(
'
{}_interp'
.
format
(
resample_type
)
,
**
locals
())
dtype
=
helper
.
input_dtype
()
dtype
=
helper
.
input_dtype
()
def
_is_list_or_turple_
(
data
):
def
_is_list_or_turple_
(
data
):
...
@@ -5952,18 +5958,16 @@ def image_resize(input,
...
@@ -5952,18 +5958,16 @@ def image_resize(input,
out
=
helper
.
create_variable_for_type_inference
(
dtype
)
out
=
helper
.
create_variable_for_type_inference
(
dtype
)
helper
.
append_op
(
helper
.
append_op
(
type
=
'
interpolate'
,
type
=
'
{}_interp'
.
format
(
resample_type
)
,
inputs
=
inputs
,
inputs
=
inputs
,
outputs
=
{
"Out"
:
out
},
outputs
=
{
"Out"
:
out
},
attrs
=
{
attrs
=
{
"out_h"
:
out_h
,
"out_h"
:
out_h
,
"out_w"
:
out_w
,
"out_w"
:
out_w
,
"interp_method"
:
resample_methods
[
resample
]
"interp_method"
:
resample_type
})
})
return
out
return
out
@
templatedoc
(
op_type
=
"
interpolate
"
)
@
templatedoc
(
op_type
=
"
bilinear_interp
"
)
def
resize_bilinear
(
input
,
def
resize_bilinear
(
input
,
out_shape
=
None
,
out_shape
=
None
,
scale
=
None
,
scale
=
None
,
...
@@ -6019,7 +6023,7 @@ def resize_bilinear(input,
...
@@ -6019,7 +6023,7 @@ def resize_bilinear(input,
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'BILINEAR'
,
actual_shape
)
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'BILINEAR'
,
actual_shape
)
@
templatedoc
(
op_type
=
"
interpolate
"
)
@
templatedoc
(
op_type
=
"
nearest_interp
"
)
def
resize_nearest
(
input
,
def
resize_nearest
(
input
,
out_shape
=
None
,
out_shape
=
None
,
scale
=
None
,
scale
=
None
,
...
@@ -7018,14 +7022,14 @@ def prelu(x, mode, param_attr=None, name=None):
...
@@ -7018,14 +7022,14 @@ def prelu(x, mode, param_attr=None, name=None):
"""
"""
Equation:
Equation:
y = \max(0, x) + alpha \min(0, x)
y = \max(0, x) + alpha
*
\min(0, x)
Args:
Args:
x (Variable): The input tensor.
x (Variable): The input tensor.
param_attr(ParamAttr|None): The parameter attribute for the learnable
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
weight (alpha).
mode (string): The mode for weight sharing
mode (string): The mode for weight sharing
. It supports all, channel
all: all elements share same weight
and element.
all: all elements share same weight
channel:elements in a channel share same weight
channel:elements in a channel share same weight
element:each element has a weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
name(str|None): A name for this layer(optional). If set None, the layer
...
...
python/paddle/fluid/metrics.py
浏览文件 @
7e4bd695
...
@@ -46,8 +46,8 @@ def _is_numpy_(var):
...
@@ -46,8 +46,8 @@ def _is_numpy_(var):
def
_is_number_
(
var
):
def
_is_number_
(
var
):
return
isinstance
(
var
,
int
)
or
isinstance
(
var
,
float
)
or
(
isinstance
(
return
isinstance
(
var
,
int
)
or
isinstance
(
var
,
np
.
int64
)
or
isinstance
(
var
,
np
.
ndarray
)
and
var
.
shape
==
(
1
,
))
var
,
float
)
or
(
isinstance
(
var
,
np
.
ndarray
)
and
var
.
shape
==
(
1
,
))
def
_is_number_or_matrix_
(
var
):
def
_is_number_or_matrix_
(
var
):
...
...
python/paddle/fluid/nets.py
浏览文件 @
7e4bd695
...
@@ -250,7 +250,8 @@ def sequence_conv_pool(input,
...
@@ -250,7 +250,8 @@ def sequence_conv_pool(input,
filter_size
,
filter_size
,
param_attr
=
None
,
param_attr
=
None
,
act
=
"sigmoid"
,
act
=
"sigmoid"
,
pool_type
=
"max"
):
pool_type
=
"max"
,
bias_attr
=
None
):
"""
"""
The sequence_conv_pool is composed with Sequence Convolution and Pooling.
The sequence_conv_pool is composed with Sequence Convolution and Pooling.
...
@@ -266,6 +267,11 @@ def sequence_conv_pool(input,
...
@@ -266,6 +267,11 @@ def sequence_conv_pool(input,
pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
Default :math:`max`.
Default :math:`max`.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, sequence_conv
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
Return:
Return:
Variable: The final result after Sequence Convolution and Pooling.
Variable: The final result after Sequence Convolution and Pooling.
...
@@ -289,6 +295,7 @@ def sequence_conv_pool(input,
...
@@ -289,6 +295,7 @@ def sequence_conv_pool(input,
num_filters
=
num_filters
,
num_filters
=
num_filters
,
filter_size
=
filter_size
,
filter_size
=
filter_size
,
param_attr
=
param_attr
,
param_attr
=
param_attr
,
bias_attr
=
bias_attr
,
act
=
act
)
act
=
act
)
pool_out
=
layers
.
sequence_pool
(
input
=
conv_out
,
pool_type
=
pool_type
)
pool_out
=
layers
.
sequence_pool
(
input
=
conv_out
,
pool_type
=
pool_type
)
...
...
python/paddle/fluid/tests/book/test_image_classification.py
浏览文件 @
7e4bd695
...
@@ -239,7 +239,7 @@ def infer(use_cuda, save_dirname=None):
...
@@ -239,7 +239,7 @@ def infer(use_cuda, save_dirname=None):
assert
len
(
results
[
0
])
==
len
(
transpiler_results
[
0
])
assert
len
(
results
[
0
])
==
len
(
transpiler_results
[
0
])
for
i
in
range
(
len
(
results
[
0
])):
for
i
in
range
(
len
(
results
[
0
])):
np
.
testing
.
assert_almost_equal
(
np
.
testing
.
assert_almost_equal
(
results
[
0
][
i
],
transpiler_results
[
0
][
i
],
decimal
=
5
)
results
[
0
][
i
],
transpiler_results
[
0
][
i
],
decimal
=
4
)
print
(
"infer results: "
,
results
[
0
])
print
(
"infer results: "
,
results
[
0
])
...
...
python/paddle/fluid/tests/test_detection.py
浏览文件 @
7e4bd695
...
@@ -112,6 +112,8 @@ class TestDetection(unittest.TestCase):
...
@@ -112,6 +112,8 @@ class TestDetection(unittest.TestCase):
class
TestPriorBox
(
unittest
.
TestCase
):
class
TestPriorBox
(
unittest
.
TestCase
):
def
test_prior_box
(
self
):
def
test_prior_box
(
self
):
program
=
Program
()
with
program_guard
(
program
):
data_shape
=
[
3
,
224
,
224
]
data_shape
=
[
3
,
224
,
224
]
images
=
fluid
.
layers
.
data
(
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
data_shape
,
dtype
=
'float32'
)
name
=
'pixel'
,
shape
=
data_shape
,
dtype
=
'float32'
)
...
@@ -130,6 +132,8 @@ class TestPriorBox(unittest.TestCase):
...
@@ -130,6 +132,8 @@ class TestPriorBox(unittest.TestCase):
class
TestDensityPriorBox
(
unittest
.
TestCase
):
class
TestDensityPriorBox
(
unittest
.
TestCase
):
def
test_density_prior_box
(
self
):
def
test_density_prior_box
(
self
):
program
=
Program
()
with
program_guard
(
program
):
data_shape
=
[
3
,
224
,
224
]
data_shape
=
[
3
,
224
,
224
]
images
=
fluid
.
layers
.
data
(
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
data_shape
,
dtype
=
'float32'
)
name
=
'pixel'
,
shape
=
data_shape
,
dtype
=
'float32'
)
...
@@ -143,7 +147,7 @@ class TestDensityPriorBox(unittest.TestCase):
...
@@ -143,7 +147,7 @@ class TestDensityPriorBox(unittest.TestCase):
clip
=
True
)
clip
=
True
)
assert
len
(
box
.
shape
)
==
4
assert
len
(
box
.
shape
)
==
4
assert
box
.
shape
==
var
.
shape
assert
box
.
shape
==
var
.
shape
assert
box
.
shape
[
3
]
==
4
assert
box
.
shape
[
-
1
]
==
4
class
TestAnchorGenerator
(
unittest
.
TestCase
):
class
TestAnchorGenerator
(
unittest
.
TestCase
):
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
7e4bd695
...
@@ -81,25 +81,27 @@ list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
...
@@ -81,25 +81,27 @@ list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
list
(
REMOVE_ITEM TEST_OPS test_dist_transformer
)
list
(
REMOVE_ITEM TEST_OPS test_dist_transformer
)
list
(
REMOVE_ITEM TEST_OPS test_parallel_executor_transformer
)
list
(
REMOVE_ITEM TEST_OPS test_parallel_executor_transformer
)
list
(
REMOVE_ITEM TEST_OPS test_image_classification_resnet
)
list
(
REMOVE_ITEM TEST_OPS test_image_classification_resnet
)
list
(
REMOVE_ITEM TEST_OPS test_interpolate_op
)
list
(
REMOVE_ITEM TEST_OPS test_bilinear_interp_op
)
list
(
REMOVE_ITEM TEST_OPS test_nearest_interp_op
)
foreach
(
TEST_OP
${
TEST_OPS
}
)
foreach
(
TEST_OP
${
TEST_OPS
}
)
py_test_modules
(
${
TEST_OP
}
MODULES
${
TEST_OP
}
)
py_test_modules
(
${
TEST_OP
}
MODULES
${
TEST_OP
}
)
endforeach
(
TEST_OP
)
endforeach
(
TEST_OP
)
py_test_modules
(
test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=
${
WARPCTC_LIB_DIR
}
SERIAL
)
py_test_modules
(
test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=
${
WARPCTC_LIB_DIR
}
SERIAL
)
py_test_modules
(
test_interpolate_op MODULES test_interpolate_op SERIAL
)
py_test_modules
(
test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL
)
py_test_modules
(
test_nearest_interp_op MODULES test_nearest_interp_op SERIAL
)
if
(
WITH_DISTRIBUTE
)
if
(
WITH_DISTRIBUTE
)
py_test_modules
(
test_dist_train MODULES test_dist_train SERIAL
)
py_test_modules
(
test_dist_train MODULES test_dist_train SERIAL
)
set_tests_properties
(
test_listen_and_serv_op PROPERTIES TIMEOUT 20
)
set_tests_properties
(
test_listen_and_serv_op PROPERTIES TIMEOUT 20
)
if
(
NOT APPLE
)
if
(
NOT APPLE
)
set_tests_properties
(
test_dist_mnist PROPERTIES TIMEOUT 200
)
set_tests_properties
(
test_dist_mnist PROPERTIES TIMEOUT 200
)
set_tests_properties
(
test_dist_word2vec PROPERTIES TIMEOUT 200
)
set_tests_properties
(
test_dist_word2vec PROPERTIES TIMEOUT 200
)
py_test_modules
(
test_dist_se_resnext MODULES test_dist_se_resnext
)
# FIXME(typhoonzero): add these tests back
set_tests_properties
(
test_dist_se_resnext PROPERTIES TIMEOUT 1000
)
# py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext
)
# FIXME(typhoonzero): add this back
# set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
#py_test_modules(test_dist_transformer MODULES test_dist_transformer)
#
py_test_modules(test_dist_transformer MODULES test_dist_transformer)
#set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
#
set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
# TODO(typhoonzero): make dist test parallel when fix port management issue
# TODO(typhoonzero): make dist test parallel when fix port management issue
set_tests_properties
(
test_dist_mnist test_dist_word2vec test_dist_
se_resnext test_dist_
ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE
)
set_tests_properties
(
test_dist_mnist test_dist_word2vec test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE
)
endif
(
NOT APPLE
)
endif
(
NOT APPLE
)
py_test_modules
(
test_dist_transpiler MODULES test_dist_transpiler
)
py_test_modules
(
test_dist_transpiler MODULES test_dist_transpiler
)
endif
()
endif
()
...
...
python/paddle/fluid/tests/unittests/test_
interpolate
_op.py
→
python/paddle/fluid/tests/unittests/test_
bilinear_interp
_op.py
浏览文件 @
7e4bd695
...
@@ -20,36 +20,6 @@ from op_test import OpTest
...
@@ -20,36 +20,6 @@ from op_test import OpTest
import
paddle.fluid.core
as
core
import
paddle.fluid.core
as
core
def
nearest_neighbor_interp_np
(
X
,
out_h
,
out_w
,
out_size
=
None
,
actual_shape
=
None
):
"""nearest neighbor interpolation implement in shape [N, C, H, W]"""
if
out_size
is
not
None
:
out_h
=
out_size
[
0
]
out_w
=
out_size
[
1
]
if
actual_shape
is
not
None
:
out_h
=
actual_shape
[
0
]
out_w
=
actual_shape
[
1
]
n
,
c
,
in_h
,
in_w
=
X
.
shape
ratio_h
=
ratio_w
=
0.0
if
out_h
>
1
:
ratio_h
=
(
in_h
-
1.0
)
/
(
out_h
-
1.0
)
if
out_w
>
1
:
ratio_w
=
(
in_w
-
1.0
)
/
(
out_w
-
1.0
)
out
=
np
.
zeros
((
n
,
c
,
out_h
,
out_w
))
for
i
in
range
(
out_h
):
in_i
=
int
(
ratio_h
*
i
+
0.5
)
for
j
in
range
(
out_w
):
in_j
=
int
(
ratio_w
*
j
+
0.5
)
out
[:,
:,
i
,
j
]
=
X
[:,
:,
in_i
,
in_j
]
return
out
.
astype
(
X
.
dtype
)
def
bilinear_interp_np
(
input
,
out_h
,
out_w
,
out_size
=
None
,
actual_shape
=
None
):
def
bilinear_interp_np
(
input
,
out_h
,
out_w
,
out_size
=
None
,
actual_shape
=
None
):
"""bilinear interpolation implement in shape [N, C, H, W]"""
"""bilinear interpolation implement in shape [N, C, H, W]"""
if
out_size
is
not
None
:
if
out_size
is
not
None
:
...
@@ -87,22 +57,16 @@ def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
...
@@ -87,22 +57,16 @@ def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
return
out
.
astype
(
input
.
dtype
)
return
out
.
astype
(
input
.
dtype
)
INTERPOLATE_FUNCS
=
{
class
TestBilinearInterpOp
(
OpTest
):
'bilinear'
:
bilinear_interp_np
,
'nearest'
:
nearest_neighbor_interp_np
,
}
class
TestInterpolateOp
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
out_size
=
None
self
.
out_size
=
None
self
.
actual_shape
=
None
self
.
actual_shape
=
None
self
.
init_test_case
()
self
.
init_test_case
()
self
.
op_type
=
"
interpolate
"
self
.
op_type
=
"
bilinear_interp
"
input_np
=
np
.
random
.
random
(
self
.
input_shape
).
astype
(
"float32"
)
input_np
=
np
.
random
.
random
(
self
.
input_shape
).
astype
(
"float32"
)
output_np
=
INTERPOLATE_FUNCS
[
self
.
interp_method
](
output_np
=
bilinear_interp_np
(
input_np
,
self
.
out_h
,
self
.
out_w
,
input_np
,
self
.
out_h
,
self
.
out_w
,
self
.
out_size
,
self
.
actual_shape
)
self
.
out_size
,
self
.
actual_shape
)
self
.
inputs
=
{
'X'
:
input_np
}
self
.
inputs
=
{
'X'
:
input_np
}
if
self
.
out_size
is
not
None
:
if
self
.
out_size
is
not
None
:
self
.
inputs
[
'OutSize'
]
=
self
.
out_size
self
.
inputs
[
'OutSize'
]
=
self
.
out_size
...
@@ -129,7 +93,7 @@ class TestInterpolateOp(OpTest):
...
@@ -129,7 +93,7 @@ class TestInterpolateOp(OpTest):
self
.
out_size
=
np
.
array
([
3
,
3
]).
astype
(
"int32"
)
self
.
out_size
=
np
.
array
([
3
,
3
]).
astype
(
"int32"
)
class
TestBilinearInterpCase1
(
Test
Interpolate
Op
):
class
TestBilinearInterpCase1
(
Test
BilinearInterp
Op
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
...
@@ -137,7 +101,7 @@ class TestBilinearInterpCase1(TestInterpolateOp):
...
@@ -137,7 +101,7 @@ class TestBilinearInterpCase1(TestInterpolateOp):
self
.
out_w
=
1
self
.
out_w
=
1
class
TestBilinearInterpCase2
(
Test
Interpolate
Op
):
class
TestBilinearInterpCase2
(
Test
BilinearInterp
Op
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
3
,
3
,
9
,
6
]
self
.
input_shape
=
[
3
,
3
,
9
,
6
]
...
@@ -145,7 +109,7 @@ class TestBilinearInterpCase2(TestInterpolateOp):
...
@@ -145,7 +109,7 @@ class TestBilinearInterpCase2(TestInterpolateOp):
self
.
out_w
=
12
self
.
out_w
=
12
class
TestBilinearInterpCase3
(
Test
Interpolate
Op
):
class
TestBilinearInterpCase3
(
Test
BilinearInterp
Op
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
1
,
1
,
128
,
64
]
self
.
input_shape
=
[
1
,
1
,
128
,
64
]
...
@@ -153,7 +117,7 @@ class TestBilinearInterpCase3(TestInterpolateOp):
...
@@ -153,7 +117,7 @@ class TestBilinearInterpCase3(TestInterpolateOp):
self
.
out_w
=
128
self
.
out_w
=
128
class
TestBilinearInterpCase4
(
Test
Interpolate
Op
):
class
TestBilinearInterpCase4
(
Test
BilinearInterp
Op
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
...
@@ -162,7 +126,7 @@ class TestBilinearInterpCase4(TestInterpolateOp):
...
@@ -162,7 +126,7 @@ class TestBilinearInterpCase4(TestInterpolateOp):
self
.
out_size
=
np
.
array
([
2
,
2
]).
astype
(
"int32"
)
self
.
out_size
=
np
.
array
([
2
,
2
]).
astype
(
"int32"
)
class
TestBilinearInterpCase5
(
Test
Interpolate
Op
):
class
TestBilinearInterpCase5
(
Test
BilinearInterp
Op
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
3
,
3
,
9
,
6
]
self
.
input_shape
=
[
3
,
3
,
9
,
6
]
...
@@ -171,7 +135,7 @@ class TestBilinearInterpCase5(TestInterpolateOp):
...
@@ -171,7 +135,7 @@ class TestBilinearInterpCase5(TestInterpolateOp):
self
.
out_size
=
np
.
array
([
11
,
11
]).
astype
(
"int32"
)
self
.
out_size
=
np
.
array
([
11
,
11
]).
astype
(
"int32"
)
class
TestBilinearInterpCase6
(
Test
Interpolate
Op
):
class
TestBilinearInterpCase6
(
Test
BilinearInterp
Op
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
1
,
1
,
128
,
64
]
self
.
input_shape
=
[
1
,
1
,
128
,
64
]
...
@@ -180,7 +144,7 @@ class TestBilinearInterpCase6(TestInterpolateOp):
...
@@ -180,7 +144,7 @@ class TestBilinearInterpCase6(TestInterpolateOp):
self
.
out_size
=
np
.
array
([
65
,
129
]).
astype
(
"int32"
)
self
.
out_size
=
np
.
array
([
65
,
129
]).
astype
(
"int32"
)
class
TestBilinearInterpActualShape
(
Test
Interpolate
Op
):
class
TestBilinearInterpActualShape
(
Test
BilinearInterp
Op
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
3
,
2
,
32
,
16
]
self
.
input_shape
=
[
3
,
2
,
32
,
16
]
...
@@ -189,25 +153,16 @@ class TestBilinearInterpActualShape(TestInterpolateOp):
...
@@ -189,25 +153,16 @@ class TestBilinearInterpActualShape(TestInterpolateOp):
self
.
out_size
=
np
.
array
([
66
,
40
]).
astype
(
"int32"
)
self
.
out_size
=
np
.
array
([
66
,
40
]).
astype
(
"int32"
)
class
TestBilinearInterpBigScale
(
TestInterpolateOp
):
class
TestBilinearInterpOpUint8
(
OpTest
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
4
,
4
,
64
,
32
]
self
.
out_h
=
100
self
.
out_w
=
50
self
.
out_size
=
np
.
array
([
101
,
51
]).
astype
(
'int32'
)
class
TestInterpolateOpUint8
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
out_size
=
None
self
.
out_size
=
None
self
.
actual_shape
=
None
self
.
actual_shape
=
None
self
.
init_test_case
()
self
.
init_test_case
()
self
.
op_type
=
"
interpolate
"
self
.
op_type
=
"
bilinear_interp
"
input_np
=
np
.
random
.
randint
(
input_np
=
np
.
random
.
randint
(
low
=
0
,
high
=
256
,
size
=
self
.
input_shape
).
astype
(
"uint8"
)
low
=
0
,
high
=
256
,
size
=
self
.
input_shape
).
astype
(
"uint8"
)
output_np
=
INTERPOLATE_FUNCS
[
self
.
interp_method
](
output_np
=
bilinear_interp_np
(
input_np
,
self
.
out_h
,
self
.
out_w
,
input_np
,
self
.
out_h
,
self
.
out_w
,
self
.
out_size
,
self
.
actual_shape
)
self
.
out_size
,
self
.
actual_shape
)
self
.
inputs
=
{
'X'
:
input_np
}
self
.
inputs
=
{
'X'
:
input_np
}
if
self
.
out_size
is
not
None
:
if
self
.
out_size
is
not
None
:
self
.
inputs
[
'OutSize'
]
=
self
.
out_size
self
.
inputs
[
'OutSize'
]
=
self
.
out_size
...
@@ -228,7 +183,7 @@ class TestInterpolateOpUint8(OpTest):
...
@@ -228,7 +183,7 @@ class TestInterpolateOpUint8(OpTest):
self
.
out_w
=
9
self
.
out_w
=
9
class
TestBilinearInterpCase1Uint8
(
Test
Interpolate
OpUint8
):
class
TestBilinearInterpCase1Uint8
(
Test
BilinearInterp
OpUint8
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
2
,
3
,
128
,
64
]
self
.
input_shape
=
[
2
,
3
,
128
,
64
]
...
@@ -236,7 +191,7 @@ class TestBilinearInterpCase1Uint8(TestInterpolateOpUint8):
...
@@ -236,7 +191,7 @@ class TestBilinearInterpCase1Uint8(TestInterpolateOpUint8):
self
.
out_w
=
50
self
.
out_w
=
50
class
TestBilinearInterpCase2Uint8
(
Test
Interpolate
OpUint8
):
class
TestBilinearInterpCase2Uint8
(
Test
BilinearInterp
OpUint8
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
...
@@ -245,91 +200,5 @@ class TestBilinearInterpCase2Uint8(TestInterpolateOpUint8):
...
@@ -245,91 +200,5 @@ class TestBilinearInterpCase2Uint8(TestInterpolateOpUint8):
self
.
out_size
=
np
.
array
([
6
,
15
]).
astype
(
"int32"
)
self
.
out_size
=
np
.
array
([
6
,
15
]).
astype
(
"int32"
)
class
TestNearestNeighborInterpCase1
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
out_h
=
1
self
.
out_w
=
1
class
TestNearestNeighborInterpCase2
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
3
,
3
,
9
,
6
]
self
.
out_h
=
12
self
.
out_w
=
12
class
TestNearestNeighborInterpCase3
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
1
,
1
,
128
,
64
]
self
.
out_h
=
64
self
.
out_w
=
128
class
TestNearestNeighborInterpCase4
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
out_h
=
1
self
.
out_w
=
1
self
.
out_size
=
np
.
array
([
2
,
2
]).
astype
(
"int32"
)
class
TestNearestNeighborInterpCase5
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
3
,
3
,
9
,
6
]
self
.
out_h
=
12
self
.
out_w
=
12
self
.
out_size
=
np
.
array
([
11
,
11
]).
astype
(
"int32"
)
class
TestNearestNeighborInterpCase6
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
1
,
1
,
128
,
64
]
self
.
out_h
=
64
self
.
out_w
=
128
self
.
out_size
=
np
.
array
([
65
,
129
]).
astype
(
"int32"
)
class
TestNearestNeighborInterpActualShape
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
3
,
2
,
32
,
16
]
self
.
out_h
=
64
self
.
out_w
=
32
self
.
out_size
=
np
.
array
([
66
,
40
]).
astype
(
"int32"
)
class
TestNearestNeighborInterpBigScale
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
4
,
4
,
64
,
32
]
self
.
out_h
=
100
self
.
out_w
=
50
self
.
out_size
=
np
.
array
([
101
,
51
]).
astype
(
'int32'
)
class
TestNearestNeighborInterpCase1Uint8
(
TestInterpolateOpUint8
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
2
,
3
,
128
,
64
]
self
.
out_h
=
120
self
.
out_w
=
50
class
TestNearestNeighborInterpCase2Uint8
(
TestInterpolateOpUint8
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
out_h
=
5
self
.
out_w
=
13
self
.
out_size
=
np
.
array
([
6
,
15
]).
astype
(
"int32"
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
浏览文件 @
7e4bd695
...
@@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest):
...
@@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest):
'offset'
:
self
.
offset
,
'offset'
:
self
.
offset
,
'densities'
:
self
.
densities
,
'densities'
:
self
.
densities
,
'fixed_sizes'
:
self
.
fixed_sizes
,
'fixed_sizes'
:
self
.
fixed_sizes
,
'fixed_ratios'
:
self
.
fixed_ratios
'fixed_ratios'
:
self
.
fixed_ratios
,
'flatten_to_2d'
:
self
.
flatten_to_2d
}
}
self
.
outputs
=
{
'Boxes'
:
self
.
out_boxes
,
'Variances'
:
self
.
out_var
}
self
.
outputs
=
{
'Boxes'
:
self
.
out_boxes
,
'Variances'
:
self
.
out_var
}
...
@@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest):
...
@@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest):
self
.
set_data
()
self
.
set_data
()
def
set_density
(
self
):
def
set_density
(
self
):
self
.
densities
=
[]
self
.
densities
=
[
4
,
2
,
1
]
self
.
fixed_sizes
=
[]
self
.
fixed_sizes
=
[
32.0
,
64.0
,
128.0
]
self
.
fixed_ratios
=
[]
self
.
fixed_ratios
=
[
1.0
]
self
.
layer_w
=
17
self
.
layer_h
=
17
self
.
image_w
=
533
self
.
image_h
=
533
self
.
flatten_to_2d
=
False
def
init_test_params
(
self
):
def
init_test_params
(
self
):
self
.
layer_w
=
32
self
.
set_density
()
self
.
layer_h
=
32
self
.
image_w
=
40
self
.
image_h
=
40
self
.
step_w
=
float
(
self
.
image_w
)
/
float
(
self
.
layer_w
)
self
.
step_w
=
float
(
self
.
image_w
)
/
float
(
self
.
layer_w
)
self
.
step_h
=
float
(
self
.
image_h
)
/
float
(
self
.
layer_h
)
self
.
step_h
=
float
(
self
.
image_h
)
/
float
(
self
.
layer_h
)
...
@@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest):
...
@@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest):
self
.
variances
=
[
0.1
,
0.1
,
0.2
,
0.2
]
self
.
variances
=
[
0.1
,
0.1
,
0.2
,
0.2
]
self
.
variances
=
np
.
array
(
self
.
variances
,
dtype
=
np
.
float
).
flatten
()
self
.
variances
=
np
.
array
(
self
.
variances
,
dtype
=
np
.
float
).
flatten
()
self
.
set_density
()
self
.
clip
=
True
self
.
clip
=
True
self
.
num_priors
=
0
self
.
num_priors
=
0
if
len
(
self
.
fixed_sizes
)
>
0
and
len
(
self
.
densities
)
>
0
:
if
len
(
self
.
fixed_sizes
)
>
0
and
len
(
self
.
densities
)
>
0
:
...
@@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest):
...
@@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest):
(
self
.
layer_h
,
self
.
layer_w
,
self
.
num_priors
,
1
))
(
self
.
layer_h
,
self
.
layer_w
,
self
.
num_priors
,
1
))
self
.
out_boxes
=
out_boxes
.
astype
(
'float32'
)
self
.
out_boxes
=
out_boxes
.
astype
(
'float32'
)
self
.
out_var
=
out_var
.
astype
(
'float32'
)
self
.
out_var
=
out_var
.
astype
(
'float32'
)
if
self
.
flatten_to_2d
:
self
.
out_boxes
=
self
.
out_boxes
.
reshape
((
-
1
,
4
))
self
.
out_var
=
self
.
out_var
.
reshape
((
-
1
,
4
))
class
TestDensityPriorBox
(
TestDensityPriorBoxOp
):
class
TestDensityPriorBox
(
TestDensityPriorBoxOp
):
...
@@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp):
...
@@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp):
self
.
densities
=
[
3
,
4
]
self
.
densities
=
[
3
,
4
]
self
.
fixed_sizes
=
[
1.0
,
2.0
]
self
.
fixed_sizes
=
[
1.0
,
2.0
]
self
.
fixed_ratios
=
[
1.0
]
self
.
fixed_ratios
=
[
1.0
]
self
.
layer_w
=
32
self
.
layer_h
=
32
self
.
image_w
=
40
self
.
image_h
=
40
self
.
flatten_to_2d
=
True
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
0 → 100644
浏览文件 @
7e4bd695
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
import
paddle.fluid.core
as
core
class
TestFusionTransposeFlattenConcationOp
(
OpTest
):
def
setUp
(
self
):
self
.
init_test_case
()
self
.
op_type
=
"fusion_transpose_flatten_concat"
ins
=
[]
flats
=
[]
for
i
in
range
(
len
(
self
.
shapes
)):
in_shape
=
self
.
shapes
[
i
]
a
=
np
.
random
.
random
(
in_shape
).
astype
(
"float32"
)
ins
.
append
((
"x%d"
%
i
,
a
))
b
=
a
.
transpose
(
self
.
trans_axis
)
flat_shape
=
(
np
.
prod
(
b
.
shape
[:
self
.
flatten_axis
]),
np
.
prod
(
b
.
shape
[
self
.
flatten_axis
:]))
c
=
b
.
reshape
(
flat_shape
)
flats
.
append
(
c
)
out
=
np
.
concatenate
(
flats
,
axis
=
self
.
concat_axis
)
self
.
inputs
=
{
'X'
:
ins
}
self
.
attrs
=
{
'trans_axis'
:
list
(
self
.
trans_axis
),
'flatten_axis'
:
self
.
flatten_axis
,
'concat_axis'
:
self
.
concat_axis
}
self
.
outputs
=
{
'Out'
:
out
}
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
self
.
check_output_with_place
(
place
,
1e-6
)
else
:
pass
def
init_test_case
(
self
):
self
.
shapes
=
[(
3
,
4
,
17
,
17
),
(
3
,
8
,
7
,
7
),
(
3
,
12
,
5
,
5
)]
self
.
trans_axis
=
(
0
,
2
,
3
,
1
)
self
.
flatten_axis
=
1
self
.
concat_axis
=
1
class
TestCase1
(
TestFusionTransposeFlattenConcationOp
):
def
init_test_case
(
self
):
self
.
shapes
=
[(
3
,
4
,
18
,
17
),
(
3
,
8
,
18
,
7
),
(
6
,
12
,
9
,
5
)]
self
.
trans_axis
=
(
0
,
2
,
3
,
1
)
self
.
flatten_axis
=
2
self
.
concat_axis
=
1
class
TestCase2
(
TestFusionTransposeFlattenConcationOp
):
def
init_test_case
(
self
):
self
.
shapes
=
[(
3
,
8
,
20
,
17
),
(
3
,
8
,
19
,
17
),
(
3
,
8
,
40
,
17
)]
self
.
trans_axis
=
(
0
,
2
,
3
,
1
)
self
.
flatten_axis
=
2
self
.
concat_axis
=
0
class
TestCase3
(
TestFusionTransposeFlattenConcationOp
):
def
init_test_case
(
self
):
self
.
shapes
=
[(
3
,
8
,
20
,
17
),
(
3
,
8
,
19
,
17
),
(
3
,
8
,
40
,
17
)]
self
.
trans_axis
=
(
0
,
3
,
2
,
1
)
self
.
flatten_axis
=
1
self
.
concat_axis
=
1
class
TestCase4
(
TestFusionTransposeFlattenConcationOp
):
def
init_test_case
(
self
):
self
.
shapes
=
[(
3
,
8
,
9
,
17
),
(
8
,
3
,
9
,
17
),
(
4
,
6
,
9
,
17
)]
self
.
trans_axis
=
(
0
,
2
,
1
,
3
)
self
.
flatten_axis
=
3
self
.
concat_axis
=
1
class
TestCase5
(
TestFusionTransposeFlattenConcationOp
):
def
init_test_case
(
self
):
self
.
shapes
=
[(
3
,
8
,
9
,
17
,
2
),
(
3
,
8
,
2
,
17
,
9
),
(
3
,
17
,
9
,
8
,
2
)]
self
.
trans_axis
=
(
0
,
2
,
1
,
4
,
3
)
self
.
flatten_axis
=
1
self
.
concat_axis
=
1
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
7e4bd695
...
@@ -219,6 +219,17 @@ class TestBook(unittest.TestCase):
...
@@ -219,6 +219,17 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
layers
.
sequence_unpad
(
x
=
x
,
length
=
length
))
self
.
assertIsNotNone
(
layers
.
sequence_unpad
(
x
=
x
,
length
=
length
))
print
(
str
(
program
))
print
(
str
(
program
))
def
test_pool2d
(
self
):
program
=
Program
()
with
program_guard
(
program
):
x
=
layers
.
data
(
name
=
'x'
,
shape
=
[
3
,
224
,
224
],
dtype
=
'float32'
)
self
.
assertIsNotNone
(
layers
.
pool2d
(
x
,
pool_size
=
[
5
,
3
],
pool_stride
=
[
1
,
2
],
pool_padding
=
(
2
,
1
)))
def
test_lstm_unit
(
self
):
def
test_lstm_unit
(
self
):
program
=
Program
()
program
=
Program
()
with
program_guard
(
program
):
with
program_guard
(
program
):
...
...
python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
浏览文件 @
7e4bd695
...
@@ -145,10 +145,15 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
...
@@ -145,10 +145,15 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
lod
.
append
(
nmsed_num
)
lod
.
append
(
nmsed_num
)
if
nmsed_num
==
0
:
continue
if
nmsed_num
==
0
:
continue
tmp_det_out
=
[]
for
c
,
indices
in
nmsed_outs
.
items
():
for
c
,
indices
in
nmsed_outs
.
items
():
for
idx
in
indices
:
for
idx
in
indices
:
xmin
,
ymin
,
xmax
,
ymax
=
boxes
[
n
][
idx
][:]
xmin
,
ymin
,
xmax
,
ymax
=
boxes
[
n
][
idx
][:]
det_outs
.
append
([
c
,
scores
[
n
][
c
][
idx
],
xmin
,
ymin
,
xmax
,
ymax
])
tmp_det_out
.
append
(
[
c
,
scores
[
n
][
c
][
idx
],
xmin
,
ymin
,
xmax
,
ymax
])
sorted_det_out
=
sorted
(
tmp_det_out
,
key
=
lambda
tup
:
tup
[
0
],
reverse
=
False
)
det_outs
.
extend
(
sorted_det_out
)
return
det_outs
,
lod
return
det_outs
,
lod
...
...
python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
0 → 100644
浏览文件 @
7e4bd695
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
import
paddle.fluid.core
as
core
def nearest_neighbor_interp_np(X,
                               out_h,
                               out_w,
                               out_size=None,
                               actual_shape=None):
    """NumPy reference for nearest-neighbor resizing of an [N, C, H, W] array.

    Args:
        X: 4-D array unpacked as (n, c, in_h, in_w).
        out_h: target height, used unless overridden below.
        out_w: target width, used unless overridden below.
        out_size: optional 2-element sequence (height, width) that replaces
            ``out_h`` / ``out_w`` when given.
        actual_shape: optional 2-element sequence (height, width); applied
            after ``out_size``, so it wins when both are given.

    Returns:
        Array of shape (n, c, out_h, out_w) with the same dtype as ``X``.
    """
    # Resolve the target size; the later override takes precedence.
    for size_override in (out_size, actual_shape):
        if size_override is not None:
            out_h, out_w = size_override[0], size_override[1]

    n, c, in_h, in_w = X.shape

    # Scale maps the last output index onto the last input index; a target
    # extent of 1 (or less) keeps the scale at 0, i.e. always source index 0.
    ratio_h = (in_h - 1.0) / (out_h - 1.0) if out_h > 1 else 0.0
    ratio_w = (in_w - 1.0) / (out_w - 1.0) if out_w > 1 else 0.0

    # Source row/column for every output row/column (round half up via +0.5).
    src_rows = np.array(
        [int(ratio_h * row + 0.5) for row in range(out_h)], dtype=np.intp)
    src_cols = np.array(
        [int(ratio_w * col + 0.5) for col in range(out_w)], dtype=np.intp)

    # Default (float64) staging buffer, filled in a single gather and cast
    # back to the input dtype on return.
    result = np.zeros((n, c, out_h, out_w))
    result[...] = X[:, :, src_rows[:, None], src_cols[None, :]]
    return result.astype(X.dtype)
class TestNearestInterpOp(OpTest):
    """Float32 test of the "nearest_interp" op against the NumPy reference.

    Subclasses override ``init_test_case`` to change the input shape, the
    out_h / out_w attributes and the optional ``out_size`` array.
    """

    def setUp(self):
        # Defaults first, so init_test_case may override them.
        self.out_size = None
        self.actual_shape = None
        self.init_test_case()
        self.op_type = "nearest_interp"

        x_np = np.random.random(self.input_shape).astype("float32")
        expected = nearest_neighbor_interp_np(
            x_np, self.out_h, self.out_w, self.out_size, self.actual_shape)

        feeds = {'X': x_np}
        # Both optional sizes are fed through the 'OutSize' input;
        # actual_shape is applied last and therefore wins if both are set.
        for size_override in (self.out_size, self.actual_shape):
            if size_override is not None:
                feeds['OutSize'] = size_override
        self.inputs = feeds

        self.attrs = {
            'out_h': self.out_h,
            'out_w': self.out_w,
            'interp_method': self.interp_method
        }
        self.outputs = {'Out': expected}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['X'], 'Out', in_place=True)

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [2, 3, 4, 4]
        self.out_h, self.out_w = 2, 2
        self.out_size = np.array([3, 3]).astype("int32")
class TestNearestNeighborInterpCase1(TestNearestInterpOp):
    """Case: [4, 1, 7, 8] input with out_h = out_w = 1, no out_size."""

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [4, 1, 7, 8]
        self.out_h, self.out_w = 1, 1
class TestNearestNeighborInterpCase2(TestNearestInterpOp):
    """Case: [3, 3, 9, 6] input with out_h = out_w = 12, no out_size."""

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [3, 3, 9, 6]
        self.out_h, self.out_w = 12, 12
class TestNearestNeighborInterpCase3(TestNearestInterpOp):
    """Case: [1, 1, 128, 64] input with out_h = 64, out_w = 128, no out_size."""

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [1, 1, 128, 64]
        self.out_h, self.out_w = 64, 128
class TestNearestNeighborInterpCase4(TestNearestInterpOp):
    """Case: [4, 1, 7, 8] input, out_h = out_w = 1, out_size = [2, 2]."""

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [4, 1, 7, 8]
        self.out_h, self.out_w = 1, 1
        self.out_size = np.array([2, 2]).astype("int32")
class TestNearestNeighborInterpCase5(TestNearestInterpOp):
    """Case: [3, 3, 9, 6] input, out_h = out_w = 12, out_size = [11, 11]."""

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [3, 3, 9, 6]
        self.out_h, self.out_w = 12, 12
        self.out_size = np.array([11, 11]).astype("int32")
class TestNearestNeighborInterpCase6(TestNearestInterpOp):
    """Case: [1, 1, 128, 64] input, out_h = 64, out_w = 128, out_size = [65, 129]."""

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [1, 1, 128, 64]
        self.out_h, self.out_w = 64, 128
        self.out_size = np.array([65, 129]).astype("int32")
class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
    """Case: [3, 2, 32, 16] input, out_h = 64, out_w = 32, out_size = [66, 40]."""

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [3, 2, 32, 16]
        self.out_h, self.out_w = 64, 32
        # NOTE(review): despite the class name this sets out_size, not
        # actual_shape -- confirm whether that is intended.
        self.out_size = np.array([66, 40]).astype("int32")
class TestNearestInterpOpUint8(OpTest):
    """uint8 test of the "nearest_interp" op against the NumPy reference.

    Subclasses override ``init_test_case`` to change the input shape, the
    out_h / out_w attributes and the optional ``out_size`` array.
    """

    def setUp(self):
        # Defaults first, so init_test_case may override them.
        self.out_size = None
        self.actual_shape = None
        self.init_test_case()
        self.op_type = "nearest_interp"

        x_np = np.random.randint(
            low=0, high=256, size=self.input_shape).astype("uint8")
        expected = nearest_neighbor_interp_np(
            x_np, self.out_h, self.out_w, self.out_size, self.actual_shape)

        feeds = {'X': x_np}
        # Only out_size is forwarded as 'OutSize' here; actual_shape is
        # passed to the reference function but never fed to the op.
        if self.out_size is not None:
            feeds['OutSize'] = self.out_size
        self.inputs = feeds

        self.attrs = {
            'out_h': self.out_h,
            'out_w': self.out_w,
            'interp_method': self.interp_method
        }
        self.outputs = {'Out': expected}

    def test_check_output(self):
        self.check_output_with_place(place=core.CPUPlace(), atol=1)

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [1, 3, 9, 6]
        self.out_h, self.out_w = 10, 9
class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
    """uint8 case: [2, 3, 128, 64] input with out_h = 120, out_w = 50."""

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [2, 3, 128, 64]
        self.out_h, self.out_w = 120, 50
class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
    """uint8 case: [4, 1, 7, 8] input, out_h = 5, out_w = 13, out_size = [6, 15]."""

    def init_test_case(self):
        self.interp_method = 'nearest'
        self.input_shape = [4, 1, 7, 8]
        self.out_h, self.out_w = 5, 13
        self.out_size = np.array([6, 15]).astype("int32")
# Run the test cases in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
7e4bd695
...
@@ -644,6 +644,9 @@ in a single call.")
...
@@ -644,6 +644,9 @@ in a single call.")
else
:
else
:
recv_inputs
.
append
(
single_trainer_var
)
recv_inputs
.
append
(
single_trainer_var
)
self
.
_slice_params_and_optimizes
=
self
.
_get_slice_vars_and_attrs
(
endpoint
)
# step 3
# step 3
# Create a union-find data structure from optimize ops,
# Create a union-find data structure from optimize ops,
# If two ops are connected, we could add these two ops
# If two ops are connected, we could add these two ops
...
@@ -766,7 +769,7 @@ in a single call.")
...
@@ -766,7 +769,7 @@ in a single call.")
grad_to_block_id
,
merged_var
,
grad_to_block_id
,
merged_var
,
lr_ops
)
lr_ops
)
# dedup grad to ids list
# dedup grad to ids list
grad_to_block_id
=
list
(
set
(
grad_to_block_id
))
grad_to_block_id
=
list
(
set
(
grad_to_block_id
))
# append global ops
# append global ops
if
global_ops
:
if
global_ops
:
...
@@ -827,8 +830,8 @@ in a single call.")
...
@@ -827,8 +830,8 @@ in a single call.")
attrs
=
attrs
)
attrs
=
attrs
)
# add distributed attrs
# add distributed attrs
pserver_program
.
_slice_vars_and_attrs
=
self
.
_get_slice_vars_and_attrs
(
pserver_program
.
_slice_vars_and_attrs
=
list
(
endpoint
)
self
.
_slice_params_and_optimizes
.
values
()
)
pserver_program
.
_sync_with_cpp
()
pserver_program
.
_sync_with_cpp
()
# save pserver program to generate pserver side startup relatively.
# save pserver program to generate pserver side startup relatively.
...
@@ -941,12 +944,12 @@ to transpile() call.")
...
@@ -941,12 +944,12 @@ to transpile() call.")
outputs
=
{
"Out"
:
startup_tmpvar
})
outputs
=
{
"Out"
:
startup_tmpvar
})
# add slice vars
# add slice vars
s_prog
.
_slice_vars_and_attrs
=
self
.
_get_slice_vars_and_attrs
(
endpoint
)
s_prog
.
_slice_vars_and_attrs
=
pserver_program
.
_slice_vars_and_attrs
return
s_prog
return
s_prog
def
_get_slice_vars_and_attrs
(
self
,
endpoint
):
def
_get_slice_vars_and_attrs
(
self
,
endpoint
):
slice_vars_and_attrs
=
[]
slice_vars_and_attrs
=
{}
block_suffix
=
"block"
block_suffix
=
"block"
for
param
in
self
.
param_grad_ep_mapping
[
endpoint
][
"params"
]:
for
param
in
self
.
param_grad_ep_mapping
[
endpoint
][
"params"
]:
orig_var_name
,
block_name
,
_
=
self
.
_get_varname_parts
(
param
.
name
)
orig_var_name
,
block_name
,
_
=
self
.
_get_varname_parts
(
param
.
name
)
...
@@ -960,8 +963,7 @@ to transpile() call.")
...
@@ -960,8 +963,7 @@ to transpile() call.")
slice_vars
=
self
.
param_var_mapping
[
orig_var_name
]
slice_vars
=
self
.
param_var_mapping
[
orig_var_name
]
for
slice_var
in
slice_vars
[:
block_idx
]:
for
slice_var
in
slice_vars
[:
block_idx
]:
skip_dim0
+=
slice_var
.
shape
[
0
]
skip_dim0
+=
slice_var
.
shape
[
0
]
slice_vars_and_attrs
.
append
([
orig_var
,
skip_dim0
,
param
])
slice_vars_and_attrs
[
param
.
name
]
=
[
orig_var
,
skip_dim0
,
param
]
return
slice_vars_and_attrs
return
slice_vars_and_attrs
# ====================== private transpiler functions =====================
# ====================== private transpiler functions =====================
...
@@ -1662,10 +1664,10 @@ to transpile() call.")
...
@@ -1662,10 +1664,10 @@ to transpile() call.")
if
key
in
[
"Param"
,
"Grad"
,
"LearningRate"
]:
if
key
in
[
"Param"
,
"Grad"
,
"LearningRate"
]:
continue
continue
var
=
self
.
origin_program
.
global_block
().
vars
[
opt_op
.
input
(
key
)[
0
]]
var
=
self
.
origin_program
.
global_block
().
vars
[
opt_op
.
input
(
key
)[
0
]]
param_var
=
new_inputs
[
"Param"
]
# update accumulator variable shape
# update accumulator variable shape
param_shape
=
new_inputs
[
"Param"
].
shape
new_shape
=
self
.
_get_optimizer_input_shape
(
new_shape
=
self
.
_get_optimizer_input_shape
(
opt_op
.
type
,
key
,
opt_op
.
type
,
key
,
var
.
shape
,
param_var
.
shape
)
var
.
shape
,
param_shape
)
tmpvar
=
pserver_block
.
create_var
(
tmpvar
=
pserver_block
.
create_var
(
name
=
var
.
name
,
name
=
var
.
name
,
persistable
=
var
.
persistable
,
persistable
=
var
.
persistable
,
...
@@ -1673,6 +1675,13 @@ to transpile() call.")
...
@@ -1673,6 +1675,13 @@ to transpile() call.")
shape
=
new_shape
)
shape
=
new_shape
)
new_inputs
[
key
]
=
tmpvar
new_inputs
[
key
]
=
tmpvar
# var shape been changed
if
new_shape
!=
var
.
shape
:
slice_var_args
=
self
.
_slice_params_and_optimizes
[
param_var
.
name
]
self
.
_slice_params_and_optimizes
[
var
.
name
]
=
[
var
,
slice_var_args
[
1
],
tmpvar
]
# change output's ParamOut variable
# change output's ParamOut variable
outputs
=
self
.
_get_output_map_from_op
(
outputs
=
self
.
_get_output_map_from_op
(
self
.
origin_program
.
global_block
().
vars
,
opt_op
)
self
.
origin_program
.
global_block
().
vars
,
opt_op
)
...
...
tools/manylinux1/Dockerfile.x64
浏览文件 @
7e4bd695
...
@@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
...
@@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
COPY build_scripts /build_scripts
COPY build_scripts /build_scripts
RUN bash build_scripts/build.sh && \
RUN bash build_scripts/build.sh && \
bash build_scripts/install_nccl2.sh && rm -r build_scripts
bash build_scripts/install_nccl2.sh && rm -r
f
build_scripts
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
...
...
tools/manylinux1/build_scripts/build_utils.sh
浏览文件 @
7e4bd695
...
@@ -50,6 +50,15 @@ function do_cpython_build {
...
@@ -50,6 +50,15 @@ function do_cpython_build {
mkdir
-p
${
prefix
}
/lib
mkdir
-p
${
prefix
}
/lib
# -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
# -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
if
[
$(
lex_pyver
$py_ver
)
-eq
$(
lex_pyver 3.6
)
]
;
then
wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz
tar
-zxf
sqlite-autoconf-3250300.tar.gz
cd
sqlite-autoconf-3250300
./configure
--prefix
=
/usr/local
make
-j8
&&
make
install
cd
../
&&
rm
sqlite-autoconf-3250300.tar.gz
fi
# NOTE --enable-shared for generating libpython shared library needed for
# NOTE --enable-shared for generating libpython shared library needed for
# linking of some of the nupic.core test executables.
# linking of some of the nupic.core test executables.
if
[
$(
lex_pyver
$py_ver
)
-ge
$(
lex_pyver 3.7
)
]
;
then
if
[
$(
lex_pyver
$py_ver
)
-ge
$(
lex_pyver 3.7
)
]
;
then
...
@@ -59,9 +68,9 @@ function do_cpython_build {
...
@@ -59,9 +68,9 @@ function do_cpython_build {
make
-j8
>
/dev/null
make
-j8
>
/dev/null
make altinstall
>
/dev/null
make altinstall
>
/dev/null
else
else
CFLAGS
=
"-Wformat"
./configure
--prefix
=
${
prefix
}
--enable-shared
$unicode_flags
>
/dev/null
LD_LIBRARY_PATH
=
/usr/local/lib:
${
LD_LIBRARY_PATH
}
CFLAGS
=
"-Wformat"
./configure
--prefix
=
${
prefix
}
--enable-shared
$unicode_flags
>
/dev/null
make
-j8
>
/dev/null
LD_LIBRARY_PATH
=
/usr/local/lib:
${
LD_LIBRARY_PATH
}
make
-j8
>
/dev/null
make
install
>
/dev/null
LD_LIBRARY_PATH
=
/usr/local/lib:
${
LD_LIBRARY_PATH
}
make
install
>
/dev/null
fi
fi
popd
popd
echo
"ZZZ looking for libpython"
echo
"ZZZ looking for libpython"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录