Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
b5c44fd4
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b5c44fd4
编写于
11月 26, 2018
作者:
Z
Zhang, Guoming
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into prv-calibration
上级
2952d995
840c1b29
变更
175
显示空白变更内容
内联
并排
Showing
175 changed file
with
6016 addition
and
1583 deletion
+6016
-1583
.github/ISSUE_TEMPLATE/---feature-request-.md
.github/ISSUE_TEMPLATE/---feature-request-.md
+27
-0
.github/ISSUE_TEMPLATE/---inference-issue-.md
.github/ISSUE_TEMPLATE/---inference-issue-.md
+40
-0
.github/ISSUE_TEMPLATE/---installation-issue-.md
.github/ISSUE_TEMPLATE/---installation-issue-.md
+40
-0
.github/ISSUE_TEMPLATE/---model-issue-.md
.github/ISSUE_TEMPLATE/---model-issue-.md
+36
-0
.github/ISSUE_TEMPLATE/---others-.md
.github/ISSUE_TEMPLATE/---others-.md
+33
-0
.github/ISSUE_TEMPLATE/---training-issue-.md
.github/ISSUE_TEMPLATE/---training-issue-.md
+38
-0
.gitignore
.gitignore
+1
-0
AUTHORS.md
AUTHORS.md
+2
-0
CMakeLists.txt
CMakeLists.txt
+28
-10
Dockerfile
Dockerfile
+39
-0
cmake/external/dlpack.cmake
cmake/external/dlpack.cmake
+31
-0
cmake/external/eigen.cmake
cmake/external/eigen.cmake
+1
-1
cmake/external/gtest.cmake
cmake/external/gtest.cmake
+4
-0
cmake/external/mkldnn.cmake
cmake/external/mkldnn.cmake
+1
-1
cmake/external/rocprim.cmake
cmake/external/rocprim.cmake
+44
-0
cmake/external/snappy.cmake
cmake/external/snappy.cmake
+10
-2
cmake/external/snappystream.cmake
cmake/external/snappystream.cmake
+35
-26
cmake/flags.cmake
cmake/flags.cmake
+3
-0
cmake/generic.cmake
cmake/generic.cmake
+18
-11
cmake/hip.cmake
cmake/hip.cmake
+27
-5
cmake/operators.cmake
cmake/operators.cmake
+1
-3
cmake/simd.cmake
cmake/simd.cmake
+38
-35
paddle/fluid/API.spec
paddle/fluid/API.spec
+3
-2
paddle/fluid/CMakeLists.txt
paddle/fluid/CMakeLists.txt
+1
-5
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+6
-10
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
...luid/framework/details/fast_threaded_ssa_graph_executor.h
+1
-1
paddle/fluid/framework/dlpack_tensor.cc
paddle/fluid/framework/dlpack_tensor.cc
+127
-0
paddle/fluid/framework/dlpack_tensor.h
paddle/fluid/framework/dlpack_tensor.h
+45
-0
paddle/fluid/framework/dlpack_tensor_test.cc
paddle/fluid/framework/dlpack_tensor_test.cc
+113
-0
paddle/fluid/framework/eigen.h
paddle/fluid/framework/eigen.h
+0
-5
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+1
-0
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+1
-0
paddle/fluid/framework/op_desc.cc
paddle/fluid/framework/op_desc.cc
+6
-0
paddle/fluid/framework/op_registry.h
paddle/fluid/framework/op_registry.h
+0
-5
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+17
-34
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+5
-2
paddle/fluid/framework/transfer_scope_cache.cc
paddle/fluid/framework/transfer_scope_cache.cc
+72
-0
paddle/fluid/framework/transfer_scope_cache.h
paddle/fluid/framework/transfer_scope_cache.h
+41
-0
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+1
-0
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+5
-4
paddle/fluid/inference/analysis/analyzer_tester.cc
paddle/fluid/inference/analysis/analyzer_tester.cc
+2
-0
paddle/fluid/inference/analysis/argument.h
paddle/fluid/inference/analysis/argument.h
+1
-0
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+2
-0
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
...id/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+1
-1
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
...uid/inference/analysis/passes/ir_analysis_compose_pass.cc
+2
-1
paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
...le/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+18
-6
paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
+5
-3
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+4
-5
paddle/fluid/inference/api/analysis_config.cc
paddle/fluid/inference/api/analysis_config.cc
+2
-0
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+12
-4
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+2
-0
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+1
-2
paddle/fluid/inference/api/api_impl.h
paddle/fluid/inference/api/api_impl.h
+0
-6
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+0
-2
paddle/fluid/inference/api/paddle_analysis_config.h
paddle/fluid/inference/api/paddle_analysis_config.h
+1
-1
paddle/fluid/inference/api/paddle_api.h
paddle/fluid/inference/api/paddle_api.h
+13
-0
paddle/fluid/inference/api/paddle_pass_builder.h
paddle/fluid/inference/api/paddle_pass_builder.h
+5
-1
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+10
-7
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+46
-24
paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+95
-0
paddle/fluid/inference/tensorrt/convert/op_converter.h
paddle/fluid/inference/tensorrt/convert/op_converter.h
+1
-1
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+89
-53
paddle/fluid/inference/tensorrt/convert/prelu_op.cc
paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/split_op.cc
paddle/fluid/inference/tensorrt/convert/split_op.cc
+3
-11
paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
...e/fluid/inference/tensorrt/convert/test_elementwise_op.cc
+56
-22
paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
...le/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
+48
-0
paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+9
-9
paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+9
-7
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+75
-13
paddle/fluid/inference/tensorrt/convert/ut_helper.h
paddle/fluid/inference/tensorrt/convert/ut_helper.h
+1
-1
paddle/fluid/inference/tensorrt/engine.cc
paddle/fluid/inference/tensorrt/engine.cc
+3
-2
paddle/fluid/inference/tensorrt/engine.h
paddle/fluid/inference/tensorrt/engine.h
+2
-2
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+4
-1
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
+64
-0
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
+111
-0
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
.../fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+138
-0
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
...e/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
+87
-0
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+2
-0
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+2
-0
paddle/fluid/inference/tensorrt/plugin/serialize.h
paddle/fluid/inference/tensorrt/plugin/serialize.h
+24
-8
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+139
-42
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+40
-38
paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
+14
-14
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+51
-21
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+14
-10
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+1
-0
paddle/fluid/inference/tests/api/config_printer.h
paddle/fluid/inference/tests/api/config_printer.h
+2
-0
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+39
-20
paddle/fluid/inference/tests/api/trt_models_tester.cc
paddle/fluid/inference/tests/api/trt_models_tester.cc
+0
-2
paddle/fluid/inference/utils/CMakeLists.txt
paddle/fluid/inference/utils/CMakeLists.txt
+2
-0
paddle/fluid/inference/utils/benchmark.cc
paddle/fluid/inference/utils/benchmark.cc
+49
-0
paddle/fluid/inference/utils/benchmark.h
paddle/fluid/inference/utils/benchmark.h
+52
-0
paddle/fluid/inference/utils/benchmark_tester.cc
paddle/fluid/inference/utils/benchmark_tester.cc
+39
-0
paddle/fluid/memory/allocation/best_fit_allocator_test.cc
paddle/fluid/memory/allocation/best_fit_allocator_test.cc
+1
-0
paddle/fluid/memory/allocation/best_fit_allocator_test.cu
paddle/fluid/memory/allocation/best_fit_allocator_test.cu
+1
-0
paddle/fluid/memory/allocation/cpu_allocator.h
paddle/fluid/memory/allocation/cpu_allocator.h
+6
-0
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+20
-12
paddle/fluid/operators/conv_fusion_op.cu.cc
paddle/fluid/operators/conv_fusion_op.cu.cc
+4
-0
paddle/fluid/operators/detection/CMakeLists.txt
paddle/fluid/operators/detection/CMakeLists.txt
+1
-1
paddle/fluid/operators/detection/density_prior_box_op.cc
paddle/fluid/operators/detection/density_prior_box_op.cc
+21
-15
paddle/fluid/operators/detection/density_prior_box_op.cu
paddle/fluid/operators/detection/density_prior_box_op.cu
+170
-0
paddle/fluid/operators/detection/density_prior_box_op.h
paddle/fluid/operators/detection/density_prior_box_op.h
+35
-38
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+5
-0
paddle/fluid/operators/distributed/grpc_server.cc
paddle/fluid/operators/distributed/grpc_server.cc
+20
-0
paddle/fluid/operators/distributed/sendrecvop_utils.cc
paddle/fluid/operators/distributed/sendrecvop_utils.cc
+2
-0
paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
.../fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
+201
-0
paddle/fluid/operators/elementwise/elementwise_op.h
paddle/fluid/operators/elementwise/elementwise_op.h
+14
-0
paddle/fluid/operators/group_norm_op.cc
paddle/fluid/operators/group_norm_op.cc
+162
-0
paddle/fluid/operators/group_norm_op.cu
paddle/fluid/operators/group_norm_op.cu
+292
-0
paddle/fluid/operators/group_norm_op.h
paddle/fluid/operators/group_norm_op.h
+197
-0
paddle/fluid/operators/hierarchical_sigmoid_op.h
paddle/fluid/operators/hierarchical_sigmoid_op.h
+1
-1
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+15
-20
paddle/fluid/operators/math/blas_impl.cu.h
paddle/fluid/operators/math/blas_impl.cu.h
+172
-34
paddle/fluid/operators/math/detail/activation_functions.h
paddle/fluid/operators/math/detail/activation_functions.h
+1
-0
paddle/fluid/operators/math/fc_compute.h
paddle/fluid/operators/math/fc_compute.h
+1
-3
paddle/fluid/operators/math/jit_code.cc
paddle/fluid/operators/math/jit_code.cc
+107
-268
paddle/fluid/operators/math/jit_code.h
paddle/fluid/operators/math/jit_code.h
+214
-12
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+10
-0
paddle/fluid/operators/math/jit_kernel_blas.cc
paddle/fluid/operators/math/jit_kernel_blas.cc
+41
-0
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+9
-6
paddle/fluid/operators/math/matrix_bit_code.h
paddle/fluid/operators/math/matrix_bit_code.h
+1
-2
paddle/fluid/operators/math/pooling.cu
paddle/fluid/operators/math/pooling.cu
+36
-0
paddle/fluid/operators/math/pooling.h
paddle/fluid/operators/math/pooling.h
+13
-0
paddle/fluid/operators/math/softmax.h
paddle/fluid/operators/math/softmax.h
+2
-1
paddle/fluid/operators/math/softmax_impl.h
paddle/fluid/operators/math/softmax_impl.h
+39
-28
paddle/fluid/operators/reader/create_py_reader_op.cc
paddle/fluid/operators/reader/create_py_reader_op.cc
+1
-1
paddle/fluid/operators/roi_align_op.cc
paddle/fluid/operators/roi_align_op.cc
+3
-3
paddle/fluid/operators/roi_pool_op.cc
paddle/fluid/operators/roi_pool_op.cc
+3
-3
paddle/fluid/operators/softmax_op.h
paddle/fluid/operators/softmax_op.h
+4
-2
paddle/fluid/operators/space_to_depth_op.cc
paddle/fluid/operators/space_to_depth_op.cc
+1
-1
paddle/fluid/operators/stack_op.h
paddle/fluid/operators/stack_op.h
+18
-6
paddle/fluid/platform/CMakeLists.txt
paddle/fluid/platform/CMakeLists.txt
+8
-4
paddle/fluid/platform/cpu_helper.cc
paddle/fluid/platform/cpu_helper.cc
+8
-1
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+47
-0
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+1
-11
paddle/fluid/platform/dynload/cublas.cc
paddle/fluid/platform/dynload/cublas.cc
+3
-0
paddle/fluid/platform/dynload/cublas.h
paddle/fluid/platform/dynload/cublas.h
+12
-4
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+0
-2
paddle/fluid/platform/enforce.h
paddle/fluid/platform/enforce.h
+15
-55
paddle/fluid/platform/gpu_info.cc
paddle/fluid/platform/gpu_info.cc
+20
-0
paddle/fluid/platform/gpu_info.h
paddle/fluid/platform/gpu_info.h
+3
-0
paddle/fluid/platform/init.cc
paddle/fluid/platform/init.cc
+1
-7
paddle/fluid/platform/init.h
paddle/fluid/platform/init.h
+0
-3
paddle/fluid/platform/port.h
paddle/fluid/platform/port.h
+31
-4
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+1
-1
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+0
-10
paddle/fluid/platform/stream_callback_manager.h
paddle/fluid/platform/stream_callback_manager.h
+6
-7
paddle/fluid/pybind/CMakeLists.txt
paddle/fluid/pybind/CMakeLists.txt
+4
-8
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+6
-18
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+37
-1
paddle/testing/paddle_gtest_main.cc
paddle/testing/paddle_gtest_main.cc
+1
-1
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+5
-4
python/paddle/fluid/contrib/inferencer.py
python/paddle/fluid/contrib/inferencer.py
+1
-3
python/paddle/fluid/contrib/trainer.py
python/paddle/fluid/contrib/trainer.py
+1
-2
python/paddle/fluid/contrib/utils/__init__.py
python/paddle/fluid/contrib/utils/__init__.py
+20
-0
python/paddle/fluid/contrib/utils/hdfs_utils.py
python/paddle/fluid/contrib/utils/hdfs_utils.py
+505
-0
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+20
-23
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+58
-60
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+426
-296
python/paddle/fluid/layers/ops.py
python/paddle/fluid/layers/ops.py
+21
-20
python/paddle/fluid/nets.py
python/paddle/fluid/nets.py
+8
-1
python/paddle/fluid/tests/book/test_image_classification.py
python/paddle/fluid/tests/book/test_image_classification.py
+1
-1
python/paddle/fluid/tests/test_detection.py
python/paddle/fluid/tests/test_detection.py
+32
-28
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+12
-0
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+8
-6
python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
...paddle/fluid/tests/unittests/test_density_prior_box_op.py
+19
-11
python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
...e/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
+263
-0
python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
...n/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+31
-13
python/paddle/fluid/tests/unittests/test_group_norm_op.py
python/paddle/fluid/tests/unittests/test_group_norm_op.py
+143
-0
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+11
-0
python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
...on/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+7
-2
python/requirements.txt
python/requirements.txt
+1
-1
tools/manylinux1/Dockerfile.x64
tools/manylinux1/Dockerfile.x64
+7
-3
tools/manylinux1/build_scripts/build.sh
tools/manylinux1/build_scripts/build.sh
+10
-9
tools/manylinux1/build_scripts/build_utils.sh
tools/manylinux1/build_scripts/build_utils.sh
+23
-3
未找到文件。
.github/ISSUE_TEMPLATE/---feature-request-.md
0 → 100644
浏览文件 @
b5c44fd4
---
name
:
建议(Feature request)
about
:
您可以提出您的建议。 You could use this template for reporting a suggestion issue.
---
欢迎您对PaddlePaddle提出建议,非常感谢您对PaddlePaddle的贡献!
在留下您的建议时,辛苦您同步提供如下信息:
-
版本、环境信息
1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1
2)CPU/GPU:您是否使用GPU进行训练,如是,请提供您的CUDA和cuDNN版本号
3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14
-
复现信息:如为报错,请给出复现环境、复现步骤
-
建议描述:请您详细描述,您认为需优化的功能
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
Please make sure that this is a feature request.
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/CUDNN version
-OS Platform (eg.Mac OS 10.14)
**To Reproduce**
Steps to reproduce the behavior
**Describe the feature and the current behavior/state.**
**Any Other info.**
.github/ISSUE_TEMPLATE/---inference-issue-.md
0 → 100644
浏览文件 @
b5c44fd4
---
name
:
预测(Inference Issue)
about
:
您可以提问预测中报错、应用等问题。 You could use this template for reporting an inference issue.
---
为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
-
标题:简洁、精准描述您的问题,例如“最新预测库的API文档在哪儿 ”
-
版本、环境信息:
1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID
2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况
3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号
4)系统环境:请您描述系统类型、版本(如Mac OS 10.14),Python版本
-预测信息
1)C++预测:请您提供预测库安装包的版本信息,及其中的version.txt文件
2)CMake包含路径的完整命令
3)API信息(如调用请提供)
4)预测库来源:官网下载/特殊环境(如BCLOUD编译)
-
复现信息:如为报错,请给出复现环境、复现步骤
-
问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
If there is no solution, please make sure that this is an inference issue including the following details:
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/CUDNN version
-OS Platform (eg.Mac OS 10.14)
-Python version
-Cmake orders
-C++version.txt
-API information
**To Reproduce**
Steps to reproduce the behavior
**Describe your current behavior**
**Code to reproduce the issue**
**Other info / logs**
.github/ISSUE_TEMPLATE/---installation-issue-.md
0 → 100644
浏览文件 @
b5c44fd4
---
name
:
安装(Installation Issue)
about
:
您可以提问安装、编译出现报错等问题。 You could use this template for reporting an installation
issue.
---
为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
建立issue时,为快速解决问题,请您根据使用情况给出如下信息:
-
标题:请包含关键词“安装错误”/“编译错误”,例如“Mac编译错误”
-
版本、环境信息:
1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID
2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况
3)GPU:请提供GPU型号,CUDA和CUDNN版本号
4)系统环境:请说明系统类型、版本(如Mac OS 10.14)、Python版本
-
安装方式信息:
1)pip安装/docker安装
2)本地编译:请提供cmake命令,编译命令
3)docker编译:请提供docker镜像,编译命令
特殊环境请注明:如离线安装等
-
复现信息:如为报错,请给出复现环境、复现步骤
-
问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in Github in case that there was a similar issue submitted or resolved before.
If there is no solution,please make sure that this is an installation issue including the following details:
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/CUDNN version
-OS Platform (eg. Mac OS 10.14)
-Python version
-
Install method: pip install/install with docker/build from source(without docker)/build within docker
-
Other special cases that you think may be related to this problem, eg. offline install, special internet condition
**To Reproduce**
Steps to reproduce the behavior
**Describe your current behavior**
**Code to reproduce the issue**
**Other info / logs**
.github/ISSUE_TEMPLATE/---model-issue-.md
0 → 100644
浏览文件 @
b5c44fd4
---
name
:
模型(Model Issue)
about
:
您可以提问模型、算法、数据集方向的使用报错等问题。You could use this template for reporting a model/
algorithm/dataset issue.
---
为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
建立issue时,为快速解决问题,请您根据使用情况给出如下信息:
-
标题:简洁、精准描述您的问题,例如“ssd 模型前置lstm报错 ”
-
版本、环境信息:
1)PaddlePaddle版本:请提供PaddlePaddle版本号,例如1.1或CommitID
2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况
3)GPU:请提供GPU型号,CUDA和CUDNN版本号
4)系统环境:请说明系统类型、版本(例如Mac OS 10.14),Python版本
-
模型信息
1)模型名称 2)使用数据集名称 3)使用算法名称 4)模型链接
-
复现信息:如为报错,请给出复现环境、复现步骤
-
问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in the github. Probably there was a similar issue submitted or resolved before.
If there is no solution, please make sure that this is an issue of models including the following details:
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/CUDNN version
-OS Platform (eg.Mac OS 10.14)
-Python version
-Name of Models&Dataset/details of operator
**To Reproduce**
Steps to reproduce the behavior
**Describe your current behavior**
**Code to reproduce the issue**
**Other info / logs**
.github/ISSUE_TEMPLATE/---others-.md
0 → 100644
浏览文件 @
b5c44fd4
---
name
:
其他(Others)
about
:
如上述分类未包含您的问题,可在此提出。 You could use this template for reporting other issues
---
为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
-
标题:简洁、精准概括您的问题
-
版本、环境信息:
1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID
2)CPU/GPU:如果您使用GPU训练,请提供GPU驱动版本、CUDA和cuDNN版本号
3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14
4)Python版本号
5)显存信息
-
复现信息:如为报错,请给出复现环境、复现步骤
-
问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
If there is no solution,please provide us with the following details :
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/cuDNN version
-OS Platform and Distribution(eg.Mac OS 10.14)
-Python version
**To Reproduce**
Steps to reproduce the behavior
**Describe your current behavior**
**Code to reproduce the issue**
**Other info / logs**
.github/ISSUE_TEMPLATE/---training-issue-.md
0 → 100644
浏览文件 @
b5c44fd4
---
name
:
训练(Training issue)
about
:
您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting a training
issue.
---
为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
-
标题:简洁、精准概括您的问题,例如“Insufficient Memory xxx”
-
版本、环境信息:
1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID
2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况
3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号
4)系统环境:请您描述系统类型、版本,例如Mac OS 10.14,Python版本
-
训练信息
1)单机/多机,单卡/多卡
2)显存信息
3)Operator信息
-
复现信息:如为报错,请给出复现环境、复现步骤
-
问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段
Thank you for contributing to PaddlePaddle.
Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before.
If there is no solution, please make sure that this is a training issue including the following details:
**System information**
-PaddlePaddle version (eg.1.1)or CommitID
-CPU: including CPUMKL/OpenBlas/MKLDNN version
-GPU: including CUDA/CUDNN version
-OS Platform (eg.Mac OS 10.14)
-Other information: Distributed training/information of operator/
Graphics card storage
**To Reproduce**
Steps to reproduce the behavior
**Describe your current behavior**
**Code to reproduce the issue**
**Other info / logs**
.gitignore
浏览文件 @
b5c44fd4
python/paddle/fluid/tests/unittests/reader_reset_test.recordio
paddle/operators/check_t.save
paddle/operators/check_tensor.ls
paddle/operators/tensor.save
...
...
AUTHORS.md
浏览文件 @
b5c44fd4
...
...
@@ -25,6 +25,7 @@
| kexinzhao | Ke-Xin Zhao |
| kuke | Yi-Bing Liu |
| lcy-seso | Ying Cao |
| cjld | Dun Liang |
| lipeng-unisound | Peng Li |
| liuyuan | Yuan Liu |
| livc | Zhao Li |
...
...
@@ -42,6 +43,7 @@
| QiJune | Jun Qi |
| qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu |
| Sand3r- | Michal Gallus |
| Superjom | Chun-Wei Yan |
| tensor-tang | Jian Tang |
| tianbingsz | Tian-Bing Xu |
...
...
CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -130,6 +130,21 @@ if (APPLE OR WIN32)
"Disable MKL for building on mac and windows"
FORCE
)
endif
()
if
(
WIN32
)
set
(
WITH_AVX OFF CACHE STRING
"Disable AVX when compiling for Windows"
FORCE
)
set
(
WITH_DSO OFF CACHE STRING
"Disable DSO when compiling for Windows"
FORCE
)
set
(
WITH_MKL OFF CACHE STRING
"Disable MKL when compiling for Windows"
FORCE
)
set
(
WITH_DISTRIBUTE OFF CACHE STRING
"Disable DISTRIBUTE when compiling for Windows"
FORCE
)
set
(
WITH_C_API OFF CACHE STRING
"Disable C_API when compiling for Windows"
FORCE
)
set
(
WITH_FLUID_ONLY ON CACHE STRING
"Enable FLUID_ONLY when compiling for Windows"
FORCE
)
endif
()
set
(
THIRD_PARTY_PATH
"
${
CMAKE_BINARY_DIR
}
/third_party"
CACHE STRING
"A path setting third party libraries download & build directories."
)
...
...
@@ -189,12 +204,14 @@ include(external/eigen) # download eigen3
include
(
external/pybind11
)
# download pybind11
include
(
external/cares
)
include
(
external/cub
)
include
(
external/rocprim
)
include
(
external/xxhash
)
# download xxhash
if
(
NOT WIN32
)
# there is no official support of snappystream, warpctc, nccl, cupti in windows
include
(
external/dlpack
)
include
(
external/snappy
)
# download snappy
include
(
external/snappystream
)
# download snappystream
if
(
NOT WIN32
)
# there is no official support of warpctc, nccl, cupti in windows
include
(
external/warpctc
)
# download, build, install warpctc
include
(
cupti
)
endif
(
NOT WIN32
)
...
...
@@ -302,6 +319,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set
(
CMAKE_CXX_FLAGS_RELWITHDEBINFO
"-O3 -g -DNDEBUG"
)
set
(
CMAKE_C_FLAGS_RELWITHDEBINFO
"-O3 -g -DNDEBUG"
)
if
(
ON_INFER
)
message
(
STATUS
"On inference mode, will take place some specific optimization."
)
add_definitions
(
-DPADDLE_ON_INFERENCE
)
else
()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message
(
WARNING
"On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only."
)
endif
()
add_subdirectory
(
paddle
)
if
(
WITH_PYTHON
)
add_subdirectory
(
python
)
...
...
@@ -312,10 +337,3 @@ if(WITH_DOC)
find_python_module
(
recommonmark REQUIRED
)
add_subdirectory
(
doc
)
endif
()
if
(
ON_INFER
)
message
(
STATUS
"On inference mode, will take place some specific optimization."
)
else
()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message
(
WARNING
"On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only."
)
endif
()
Dockerfile
浏览文件 @
b5c44fd4
...
...
@@ -22,6 +22,27 @@ ENV HOME /root
# Add bash enhancements
COPY
./paddle/scripts/docker/root/ /root/
# Prepare packages for Python
RUN
apt-get update
&&
\
apt-get
install
-y
make build-essential libssl-dev zlib1g-dev libbz2-dev
\
libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev
\
xz-utils tk-dev libffi-dev liblzma-dev
# Install Python3.6
RUN
mkdir
-p
/root/python_build/
&&
wget
-q
https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz
&&
\
tar
-zxf
sqlite-autoconf-3250300.tar.gz
&&
cd
sqlite-autoconf-3250300
&&
\
./configure
-prefix
=
/usr/local
&&
make
-j8
&&
make
install
&&
cd
../
&&
rm
sqlite-autoconf-3250300.tar.gz
&&
\
wget
-q
https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz
&&
\
tar
-xzf
Python-3.6.0.tgz
&&
cd
Python-3.6.0
&&
\
CFLAGS
=
"-Wformat"
./configure
--prefix
=
/usr/local/
--enable-shared
>
/dev/null
&&
\
make
-j8
>
/dev/null
&&
make altinstall
>
/dev/null
# Install Python3.7
RUN
wget
-q
https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz
&&
\
tar
-xzf
Python-3.7.0.tgz
&&
cd
Python-3.7.0
&&
\
CFLAGS
=
"-Wformat"
./configure
--prefix
=
/usr/local/
--enable-shared
>
/dev/null
&&
\
make
-j8
>
/dev/null
&&
make altinstall
>
/dev/null
RUN
apt-get update
&&
\
apt-get
install
-y
--allow-downgrades
patchelf
\
python3 python3-dev python3-pip
\
...
...
@@ -74,6 +95,12 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
RUN
pip3
install
-U
wheel
&&
\
pip3
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
pip3.6
install
-U
wheel
&&
\
pip3.6
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3.6
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
pip3.7
install
-U
wheel
&&
\
pip3.7
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3.7
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
easy_install
-U
pip
&&
\
pip
install
-U
pip setuptools wheel
&&
\
pip
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
...
...
@@ -82,22 +109,34 @@ RUN pip3 install -U wheel && \
RUN
pip3
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip3
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3
install
opencv-python
&&
\
pip3.6
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip3.6
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3.6
install
opencv-python
&&
\
pip3.7
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip3.7
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3.7
install
opencv-python
&&
\
pip
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip
install
opencv-python
#For docstring checker
RUN
pip3
install
pylint pytest astroid isort
RUN
pip3.6
install
pylint pytest astroid isort
RUN
pip3.7
install
pylint pytest astroid isort
RUN
pip
install
pylint pytest astroid isort LinkChecker
COPY
./python/requirements.txt /root/
RUN
pip3
install
-r
/root/requirements.txt
RUN
pip3.6
install
-r
/root/requirements.txt
RUN
pip3.7
install
-r
/root/requirements.txt
RUN
pip
install
-r
/root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN
apt-get
install
-y
libssl-dev libffi-dev
RUN
pip3
install
certifi urllib3[secure]
RUN
pip3.6
install
certifi urllib3[secure]
RUN
pip3.7
install
certifi urllib3[secure]
RUN
pip
install
certifi urllib3[secure]
...
...
cmake/external/dlpack.cmake
0 → 100644
浏览文件 @
b5c44fd4
include
(
ExternalProject
)
set
(
DLPACK_SOURCE_DIR
${
THIRD_PARTY_PATH
}
/dlpack
)
set
(
DLPACK_INCLUDE_DIR
${
DLPACK_SOURCE_DIR
}
/src/extern_dlpack/include
)
include_directories
(
${
DLPACK_INCLUDE_DIR
}
)
ExternalProject_Add
(
extern_dlpack
${
EXTERNAL_PROJECT_LOG_ARGS
}
GIT_REPOSITORY
"https://github.com/dmlc/dlpack.git"
GIT_TAG
"v0.2"
PREFIX
${
DLPACK_SOURCE_DIR
}
UPDATE_COMMAND
""
CONFIGURE_COMMAND
""
BUILD_COMMAND
""
INSTALL_COMMAND
""
TEST_COMMAND
""
)
if
(
${
CMAKE_VERSION
}
VERSION_LESS
"3.3.0"
)
set
(
dummyfile
${
CMAKE_CURRENT_BINARY_DIR
}
/dlpack_dummy.c
)
file
(
WRITE
${
dummyfile
}
"const char *dummy =
\"
${
dummyfile
}
\"
;"
)
add_library
(
dlpack STATIC
${
dummyfile
}
)
else
()
add_library
(
dlpack INTERFACE
)
endif
()
add_dependencies
(
dlpack extern_dlpack
)
LIST
(
APPEND externl_project_dependencies dlpack
)
cmake/external/eigen.cmake
浏览文件 @
b5c44fd4
...
...
@@ -17,7 +17,7 @@ if(WITH_AMD_GPU)
extern_eigen3
${
EXTERNAL_PROJECT_LOG_ARGS
}
GIT_REPOSITORY
"https://github.com/sabreshao/hipeigen.git"
GIT_TAG
0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
GIT_TAG
7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
PREFIX
${
EIGEN_SOURCE_DIR
}
UPDATE_COMMAND
""
CONFIGURE_COMMAND
""
...
...
cmake/external/gtest.cmake
浏览文件 @
b5c44fd4
...
...
@@ -50,7 +50,11 @@ IF(WITH_TESTING)
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
-DCMAKE_C_COMPILER=
${
CMAKE_C_COMPILER
}
-DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
-DCMAKE_CXX_FLAGS_RELEASE=
${
CMAKE_CXX_FLAGS_RELEASE
}
-DCMAKE_CXX_FLAGS_DEBUG=
${
CMAKE_CXX_FLAGS_DEBUG
}
-DCMAKE_C_FLAGS=
${
CMAKE_C_FLAGS
}
-DCMAKE_C_FLAGS_DEBUG=
${
CMAKE_C_FLAGS_DEBUG
}
-DCMAKE_C_FLAGS_RELEASE=
${
CMAKE_C_FLAGS_RELEASE
}
-DCMAKE_INSTALL_PREFIX=
${
GTEST_INSTALL_DIR
}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_GMOCK=ON
...
...
cmake/external/mkldnn.cmake
浏览文件 @
b5c44fd4
...
...
@@ -53,7 +53,7 @@ ExternalProject_Add(
${
EXTERNAL_PROJECT_LOG_ARGS
}
DEPENDS
${
MKLDNN_DEPENDS
}
GIT_REPOSITORY
"https://github.com/01org/mkl-dnn.git"
GIT_TAG
"
21fb5f2af1dd14e132af4f1b79160977ee487818
"
GIT_TAG
"
830a10059a018cd2634d94195140cf2d8790a75a
"
PREFIX
${
MKLDNN_SOURCES_DIR
}
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
...
...
cmake/external/rocprim.cmake
0 → 100644
浏览文件 @
b5c44fd4
# rocPRIM is only wired in for AMD GPU builds; do nothing otherwise.
if(NOT WITH_AMD_GPU)
  return()
endif()

# rocprim is "ROCm Parallel Primitives" for short.
# It is a header-only library providing HIP and HC parallel primitives
# for developing performant GPU-accelerated code on AMD ROCm platform.

# Fall back to the default HCC location when HCC_HOME expands to nothing.
if("x${HCC_HOME}" STREQUAL "x")
  set(HCC_HOME "/opt/rocm/hcc")
endif()

include(ExternalProject)

set(ROCPRIM_SOURCE_DIR  ${THIRD_PARTY_PATH}/rocprim)
set(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim)
set(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include)

# Configure with hcc, install-only, tests disabled, into ROCPRIM_INSTALL_DIR.
ExternalProject_Add(
  extern_rocprim
  GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git"
  GIT_TAG        5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc
  PREFIX         ${ROCPRIM_SOURCE_DIR}
  UPDATE_COMMAND ""
  CMAKE_ARGS     -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc
                 -DONLY_INSTALL=ON
                 -DBUILD_TEST=OFF
                 -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR}
  INSTALL_DIR    ${ROCPRIM_INSTALL_DIR}
  ${EXTERNAL_PROJECT_LOG_ARGS}
)

include_directories(${ROCPRIM_INCLUDE_DIR})

# Expose a `rocprim` target: a one-file static library on CMake older than
# 3.3.0, an INTERFACE library otherwise.
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c)
  file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";")
  add_library(rocprim STATIC ${dummyfile})
else()
  add_library(rocprim INTERFACE)
endif()

# Anything depending on `rocprim` waits for the external project first.
add_dependencies(rocprim extern_rocprim)
cmake/external/snappy.cmake
浏览文件 @
b5c44fd4
...
...
@@ -24,7 +24,11 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set
(
SNAPPY_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/install/snappy
)
set
(
SNAPPY_INCLUDE_DIR
"
${
SNAPPY_INSTALL_DIR
}
/include"
CACHE PATH
"snappy include directory."
FORCE
)
set
(
SNAPPY_LIBRARIES
"
${
SNAPPY_INSTALL_DIR
}
/lib/libsnappy.a"
)
if
(
WIN32
)
set
(
SNAPPY_LIBRARIES
"
${
SNAPPY_INSTALL_DIR
}
/lib/snappy.lib"
)
else
(
WIN32
)
set
(
SNAPPY_LIBRARIES
"
${
SNAPPY_INSTALL_DIR
}
/lib/libsnappy.a"
)
endif
(
WIN32
)
ExternalProject_Add
(
extern_snappy
...
...
@@ -34,8 +38,12 @@ ExternalProject_Add(
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
-DCMAKE_C_COMPILER=
${
CMAKE_C_COMPILER
}
-DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
-DCMAKE_C_FLAGS=
${
CMAKE_C_FLAGS
}
-DCMAKE_C_FLAGS_DEBUG=
${
CMAKE_C_FLAGS_DEBUG
}
-DCMAKE_C_FLAGS_RELEASE=
${
CMAKE_C_FLAGS_RELEASE
}
-DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
-DCMAKE_CXX_FLAGS_RELEASE=
${
CMAKE_CXX_FLAGS_RELEASE
}
-DCMAKE_CXX_FLAGS_DEBUG=
${
CMAKE_CXX_FLAGS_DEBUG
}
-DCMAKE_INSTALL_PREFIX=
${
SNAPPY_INSTALL_DIR
}
-DCMAKE_INSTALL_LIBDIR=
${
SNAPPY_INSTALL_DIR
}
/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
...
...
cmake/external/snappystream.cmake
浏览文件 @
b5c44fd4
...
...
@@ -18,15 +18,19 @@ ENDIF()
include
(
ExternalProject
)
# NOTE: snappy is needed when linking with recordio
set
(
SNAPPYSTREAM_SOURCES_DIR
${
THIRD_PARTY_PATH
}
/snappy_stream
)
set
(
SNAPPYSTREAM_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/install/snappy_stream
)
set
(
SNAPPYSTREAM_INCLUDE_DIR
"
${
SNAPPYSTREAM_INSTALL_DIR
}
/include"
CACHE PATH
"snappy stream include directory."
FORCE
)
set
(
SNAPPYSTREAM_LIBRARIES
"
${
SNAPPYSTREAM_INSTALL_DIR
}
/lib/libsnappystream.a"
)
if
(
WIN32
)
# Fix me, VS2015 come without VLA support
set
(
SNAPPYSTREAM_LIBRARIES
"
${
SNAPPYSTREAM_INSTALL_DIR
}
/lib/snappystream.lib"
)
# `WARNING` is a mode keyword and must stand alone: with a trailing comma
# ("WARNING,") CMake treats it as part of the message text and prints a plain
# notice instead of emitting a real warning.
MESSAGE(WARNING "In windows, snappystream has no compile support for windows,
please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
else
(
WIN32
)
set
(
SNAPPYSTREAM_LIBRARIES
"
${
SNAPPYSTREAM_INSTALL_DIR
}
/lib/libsnappystream.a"
)
ExternalProject_Add
(
ExternalProject_Add
(
extern_snappystream
GIT_REPOSITORY
"https://github.com/hoxnox/snappystream.git"
GIT_TAG
"0.2.8"
...
...
@@ -34,8 +38,12 @@ ExternalProject_Add(
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
-DCMAKE_C_COMPILER=
${
CMAKE_C_COMPILER
}
-DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
-DCMAKE_C_FLAGS=
${
CMAKE_C_FLAGS
}
-DCMAKE_C_FLAGS_DEBUG=
${
CMAKE_C_FLAGS_DEBUG
}
-DCMAKE_C_FLAGS_RELEASE=
${
CMAKE_C_FLAGS_RELEASE
}
-DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
-DCMAKE_CXX_FLAGS_RELEASE=
${
CMAKE_CXX_FLAGS_RELEASE
}
-DCMAKE_CXX_FLAGS_DEBUG=
${
CMAKE_CXX_FLAGS_DEBUG
}
-DCMAKE_INSTALL_PREFIX=
${
SNAPPY_INSTALL_DIR
}
-DCMAKE_INSTALL_LIBDIR=
${
SNAPPY_INSTALL_DIR
}
/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
...
...
@@ -47,7 +55,8 @@ ExternalProject_Add(
-DCMAKE_INSTALL_LIBDIR:PATH=
${
SNAPPYSTREAM_INSTALL_DIR
}
/lib
-DCMAKE_BUILD_TYPE:STRING=
${
THIRD_PARTY_BUILD_TYPE
}
DEPENDS snappy
)
)
endif
(
WIN32
)
add_library
(
snappystream STATIC IMPORTED GLOBAL
)
set_property
(
TARGET snappystream PROPERTY IMPORTED_LOCATION
${
SNAPPYSTREAM_LIBRARIES
}
)
...
...
cmake/flags.cmake
浏览文件 @
b5c44fd4
...
...
@@ -129,6 +129,9 @@ set(COMMON_FLAGS
-Wno-error=parentheses-equality
# Warnings in pybind11
-Wno-error=ignored-attributes
# Warnings in Eigen, gcc 6.3
-Wno-error=terminate
# Warning in PADDLE_ENFORCE
-Wno-error=int-in-bool-context
# Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0
# Warning in tinyformat.h
-Wno-error=maybe-uninitialized
# Warning in boost gcc 7.2
)
set
(
GPU_COMMON_FLAGS
...
...
cmake/generic.cmake
浏览文件 @
b5c44fd4
...
...
@@ -351,6 +351,9 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments
(
cc_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
add_executable
(
${
TARGET_NAME
}
${
cc_test_SRCS
}
)
target_link_libraries
(
${
TARGET_NAME
}
${
cc_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
if
(
WIN32
)
target_link_libraries
(
${
TARGET_NAME
}
shlwapi
)
endif
(
WIN32
)
add_dependencies
(
${
TARGET_NAME
}
${
cc_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
add_test
(
NAME
${
TARGET_NAME
}
COMMAND
${
TARGET_NAME
}
${
cc_test_ARGS
}
...
...
@@ -451,11 +454,15 @@ function(hip_library TARGET_NAME)
else
()
add_library
(
${
TARGET_NAME
}
STATIC
${
_cmake_options
}
${
_generated_files
}
${
_sources
}
)
set_target_properties
(
${
TARGET_NAME
}
PROPERTIES LINKER_LANGUAGE CXX
)
target_link_libraries
(
${
TARGET_NAME
}
/opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a
)
target_link_libraries
(
${
TARGET_NAME
}
/opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a
/opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so
)
find_fluid_modules
(
${
TARGET_NAME
}
)
endif
()
if
(
hip_library_DEPS
)
add_dependencies
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
if
(
"
${
hip_library_DEPS
}
"
MATCHES
"ARCHIVE_START"
)
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
target_circle_link_libraries
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
list
(
REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END
)
else
()
target_link_libraries
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
endif
()
# cpplint code style
...
...
cmake/hip.cmake
浏览文件 @
b5c44fd4
...
...
@@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU)
endif
()
include_directories
(
"/opt/rocm/include"
)
include_directories
(
"/opt/rocm/hip/include"
)
include_directories
(
"/opt/rocm/miopen/include"
)
include_directories
(
"/opt/rocm/hipblas/include"
)
include_directories
(
"/opt/rocm/hiprand/include"
)
include_directories
(
"/opt/rocm/rocrand/include"
)
...
...
@@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust")
list
(
APPEND EXTERNAL_LIBS
"-L/opt/rocm/lib/ -lhip_hcc"
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-fPIC -DPADDLE_WITH_HIP -std=c++1
4
"
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-fPIC -DPADDLE_WITH_HIP -std=c++1
1
"
)
if
(
WITH_DSO
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_USE_DSO"
)
endif
(
WITH_DSO
)
if
(
WITH_DOUBLE
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_TYPE_DOUBLE"
)
endif
(
WITH_DOUBLE
)
if
(
WITH_TESTING
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_TESTING"
)
endif
(
WITH_TESTING
)
if
(
WITH_DISTRIBUTE
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_DISTRIBUTE"
)
endif
(
WITH_DISTRIBUTE
)
if
(
WITH_GRPC
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_GRPC"
)
endif
(
WITH_GRPC
)
if
(
NOT WITH_GOLANG
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITHOUT_GOLANG"
)
endif
(
NOT WITH_GOLANG
)
if
(
WITH_MKLDNN
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_MKLDNN"
)
endif
(
WITH_MKLDNN
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DANY_IMPL_ANY_CAST_MOVEABLE"
)
if
(
NOT WITH_RDMA
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_DISABLE_RDMA"
)
endif
(
NOT WITH_RDMA
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
list
(
APPEND HIP_HCC_FLAGS
${
CMAKE_CXX_FLAGS_DEBUG
}
)
elseif
(
CMAKE_BUILD_TYPE STREQUAL
"RelWithDebInfo"
)
...
...
cmake/operators.cmake
浏览文件 @
b5c44fd4
...
...
@@ -84,9 +84,7 @@ function(op_library TARGET)
endif
()
if
(
WIN32
)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach
(
windows_unsupport_op
"nccl_op"
"gen_nccl_id_op"
"warpctc_op"
"hierarchical_sigmoid_op"
"crf_decoding_op"
"select_op"
"lstmp_op"
"gru_op"
"fusion_gru_op"
"lstm_op"
"fusion_lstm_op"
"cumsum_op"
"fusion_seqconv_eltadd_relu_op"
"channel_send_op"
"channel_create_op"
"channel_close_op"
"channel_recv_op"
)
foreach
(
windows_unsupport_op
"nccl_op"
"gen_nccl_id_op"
"warpctc_op"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
windows_unsupport_op
}
"
)
return
()
endif
()
...
...
cmake/simd.cmake
浏览文件 @
b5c44fd4
...
...
@@ -57,43 +57,46 @@ int main()
return 0;
}"
SSE3_FOUND
)
# Check AVX
set
(
CMAKE_REQUIRED_FLAGS
${
AVX_FLAG
}
)
set
(
AVX_FOUND_EXITCODE 1 CACHE STRING
"Result from TRY_RUN"
FORCE
)
CHECK_CXX_SOURCE_RUNS
(
"
#include <immintrin.h>
int main()
{
# disable AVX by default on windows
if
(
NOT WIN32
)
# Check AVX
set
(
CMAKE_REQUIRED_FLAGS
${
AVX_FLAG
}
)
set
(
AVX_FOUND_EXITCODE 1 CACHE STRING
"Result from TRY_RUN"
FORCE
)
CHECK_CXX_SOURCE_RUNS
(
"
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}"
AVX_FOUND
)
}"
AVX_FOUND
)
# Check AVX 2
set
(
CMAKE_REQUIRED_FLAGS
${
AVX2_FLAG
}
)
set
(
AVX2_FOUND_EXITCODE 1 CACHE STRING
"Result from TRY_RUN"
FORCE
)
CHECK_CXX_SOURCE_RUNS
(
"
#include <immintrin.h>
int main()
{
# Check AVX 2
set
(
CMAKE_REQUIRED_FLAGS
${
AVX2_FLAG
}
)
set
(
AVX2_FOUND_EXITCODE 1 CACHE STRING
"Result from TRY_RUN"
FORCE
)
CHECK_CXX_SOURCE_RUNS
(
"
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}"
AVX2_FOUND
)
}"
AVX2_FOUND
)
# Check AVX512F
set
(
CMAKE_REQUIRED_FLAGS
${
AVX512F_FLAG
}
)
set
(
AVX512F_FOUND_EXITCODE 1 CACHE STRING
"Result from TRY_RUN"
FORCE
)
CHECK_CXX_SOURCE_RUNS
(
"
#include <immintrin.h>
int main()
{
# Check AVX512F
set
(
CMAKE_REQUIRED_FLAGS
${
AVX512F_FLAG
}
)
set
(
AVX512F_FOUND_EXITCODE 1 CACHE STRING
"Result from TRY_RUN"
FORCE
)
CHECK_CXX_SOURCE_RUNS
(
"
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
13, -5, 6, -7, 9, 2, -6, 3);
__m512i result = _mm512_abs_epi32 (a);
return 0;
}"
AVX512F_FOUND
)
}"
AVX512F_FOUND
)
endif
(
NOT WIN32
)
set
(
CMAKE_REQUIRED_FLAGS
${
CMAKE_REQUIRED_FLAGS_RETAINED
}
)
mark_as_advanced
(
MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND
)
paddle/fluid/API.spec
浏览文件 @
b5c44fd4
...
...
@@ -103,6 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
...
...
@@ -275,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', '
name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5
, None))
paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', '
flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False
, None))
paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
...
...
@@ -341,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va
paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspilerConfig.__init__
paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'
], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'
))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'
, 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None
))
paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
...
...
paddle/fluid/CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -3,13 +3,9 @@ add_subdirectory(platform)
add_subdirectory
(
framework
)
add_subdirectory
(
operators
)
add_subdirectory
(
string
)
add_subdirectory
(
pybind
)
if
(
NOT WIN32
)
add_subdirectory
(
recordio
)
endif
(
NOT WIN32
)
add_subdirectory
(
pybind
)
# NOTE: please add subdirectory inference at last.
add_subdirectory
(
inference
)
add_subdirectory
(
train
)
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -31,9 +31,7 @@ function(windows_symbolic TARGET)
endfunction
()
add_subdirectory
(
ir
)
if
(
NOT WIN32
)
add_subdirectory
(
details
)
endif
(
NOT WIN32
)
# ddim lib
proto_library
(
framework_proto SRCS framework.proto
)
...
...
@@ -68,11 +66,7 @@ if(WITH_GPU)
else
()
cc_test
(
mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor
)
endif
()
if
(
NOT WIN32
)
cc_library
(
lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version
)
else
()
cc_library
(
lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version
)
endif
(
NOT WIN32
)
cc_library
(
lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version
)
cc_test
(
lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory
)
nv_test
(
lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor
)
...
...
@@ -123,8 +117,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library
(
shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context
)
if
(
NOT WIN32
)
cc_library
(
transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto
)
cc_library
(
operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler
)
shape_inference data_transform lod_tensor profiler
transfer_scope_cache
)
else
()
cc_library
(
operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor
)
...
...
@@ -183,12 +178,10 @@ else()
cc_test
(
test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op
)
endif
()
if
(
NOT WIN32
)
cc_library
(
parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph build_strategy
fast_threaded_ssa_graph_executor
)
endif
()
# NOT WIN32
cc_library
(
prune SRCS prune.cc DEPS framework_proto
)
cc_test
(
prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context
)
...
...
@@ -205,3 +198,6 @@ cc_test(tuple_test SRCS tuple_test.cc )
if
(
NOT WIN32
)
cc_test
(
rw_lock_test SRCS rw_lock_test.cc
)
endif
(
NOT WIN32
)
cc_library
(
dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack
)
cc_test
(
dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog
)
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
浏览文件 @
b5c44fd4
...
...
@@ -13,9 +13,9 @@
// limitations under the License.
#pragma once
#include <ThreadPool.h>
#include <string>
#include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
...
...
paddle/fluid/framework/dlpack_tensor.cc
0 → 100644
浏览文件 @
b5c44fd4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/dlpack_tensor.h"
namespace
paddle
{
namespace
framework
{
namespace
internal
{
// Builds the DLPack data type descriptor for the C++ element type T.
//
// The type code is chosen in this priority order: float (platform::float16 or
// any standard floating-point type), unsigned integer, signed integer.
// Any other T raises via PADDLE_THROW. `bits` is the size of T in bits and
// `lanes` is always 1 here.
template <typename T>
static ::DLDataType GetDLDataTypeCode() {
  constexpr bool kIsFloat = std::is_same<T, platform::float16>::value ||
                            std::is_floating_point<T>::value;
  constexpr bool kIsUnsigned = std::is_unsigned<T>::value;
  constexpr bool kIsIntegral = std::is_integral<T>::value;

  ::DLDataType result;
  result.bits = 8 * sizeof(T);
  result.lanes = 1;

  if (kIsFloat) {
    result.code = kDLFloat;
  } else if (kIsUnsigned) {
    // Checked before the generic integral case so unsigned types win.
    result.code = kDLUInt;
  } else if (kIsIntegral) {
    result.code = kDLInt;
  } else {
    PADDLE_THROW("Unsupported data type %s", typeid(T).name());
  }
  return result;
}
// Looks up the DLPack data type that corresponds to a runtime std::type_index.
//
// The lookup table is built once (function-local static) from the list of
// element types registered below; an unregistered type fails PADDLE_ENFORCE.
static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) {
  using DTypeMap = std::unordered_map<std::type_index, ::DLDataType>;

// Expands to one {type_index, DLDataType} map entry for the given C++ type.
#define REG_DL_DATA_TYPE(cpp_type) \
  { std::type_index(typeid(cpp_type)), GetDLDataTypeCode<cpp_type>() }
  static const DTypeMap type_to_dtype_map({
      REG_DL_DATA_TYPE(platform::float16),  // NOLINT
      REG_DL_DATA_TYPE(float),              // NOLINT
      REG_DL_DATA_TYPE(double),             // NOLINT
      REG_DL_DATA_TYPE(int),                // NOLINT
      REG_DL_DATA_TYPE(int64_t),            // NOLINT
      REG_DL_DATA_TYPE(bool),               // NOLINT
      REG_DL_DATA_TYPE(size_t),             // NOLINT
      REG_DL_DATA_TYPE(int16_t),            // NOLINT
      REG_DL_DATA_TYPE(uint8_t),            // NOLINT
      REG_DL_DATA_TYPE(int8_t)              // NOLINT
  });
#undef REG_DL_DATA_TYPE

  const auto found = type_to_dtype_map.find(type);
  PADDLE_ENFORCE(found != type_to_dtype_map.end(), "Unsupported data type %s",
                 type.name());
  return found->second;
}
// Visitor that maps a Paddle place to the matching DLPack device context.
//
//   CPUPlace        -> {kDLCPU, 0}
//   CUDAPlace       -> {kDLGPU, place.device}   (CUDA builds only)
//   CUDAPinnedPlace -> {kDLCPUPinned, 0}        (CUDA builds only)
//
// In builds without PADDLE_WITH_CUDA the two CUDA overloads throw instead.
struct DLContextVisitor : public boost::static_visitor<::DLContext> {
  inline ::DLContext operator()(const platform::CPUPlace &place) const {
    DLContext cpu_ctx;
    cpu_ctx.device_type = kDLCPU;
    cpu_ctx.device_id = 0;
    return cpu_ctx;
  }

  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
    DLContext gpu_ctx;
    gpu_ctx.device_type = kDLGPU;
    gpu_ctx.device_id = place.device;
    return gpu_ctx;
#else
    PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version");
#endif
  }

  inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
#ifdef PADDLE_WITH_CUDA
    DLContext pinned_ctx;
    pinned_ctx.device_type = kDLCPUPinned;
    pinned_ctx.device_id = 0;
    return pinned_ctx;
#else
    PADDLE_THROW(
        "platform::CUDAPinnedPlace is not supported in CPU only version");
#endif
  }
};
}
// namespace internal
// Wraps `tensor` in a ::DLTensor view without copying the data buffer.
//
// Arguments:
//   tensor - source tensor; its data pointer, place, element type and dims
//            are read. The resulting view aliases the tensor's buffer.
//   lanes  - value stored into dtype.lanes, overriding the default set by the
//            type lookup.
//
// Raises (via PADDLE_ENFORCE) when the tensor rank does not fit into the
// fixed-size `shape_` member, instead of writing past the end of that array.
DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) {
  // init data, data buffer
  // DLTensor::data is a mutable void*, hence the const_cast on the read-only
  // tensor buffer.
  t_.data = const_cast<void *>(tensor.data<void>());

  // init ctx, DLContext type with device_type and device_id
  auto place = tensor.place();
  t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place);

  // init dtype
  t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type());
  t_.dtype.lanes = lanes;

  // init ndim, tensor rank
  auto &dims = tensor.dims();
  using DimType = decltype(t_.ndim);  // int
  t_.ndim = static_cast<DimType>(dims.size());

  // The shape is copied into the in-object buffer `shape_`; reject ranks that
  // would overflow it.
  constexpr size_t kMaxRank = sizeof(shape_) / sizeof(shape_[0]);
  PADDLE_ENFORCE(t_.ndim >= 0 && static_cast<size_t>(t_.ndim) <= kMaxRank,
                 "Tensor rank %d exceeds the maximum rank %d supported by "
                 "DLPackTensor",
                 t_.ndim, kMaxRank);

  // init shape, tensor dims
  t_.shape = shape_;
  for (DimType i = 0; i < t_.ndim; ++i) {
    t_.shape[i] = dims[i];
  }

  // init strides, nullptr means the tensor is compact
  t_.strides = nullptr;

  // init byte_offset
  t_.byte_offset = 0;
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/dlpack_tensor.h
0 → 100644
浏览文件 @
b5c44fd4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <dlpack/dlpack.h>
#include "paddle/fluid/framework/tensor.h"
namespace
paddle
{
namespace
framework
{
// Non-owning adapter that presents a framework Tensor as a ::DLTensor.
//
// The object converts implicitly to `::DLTensor&` / `const ::DLTensor&`, so it
// can be passed wherever a DLTensor reference is expected.
class DLPackTensor {
 public:
  // Type of ::DLTensor::dtype.lanes.
  using LaneType = decltype(::DLTensor::dtype.lanes);  // uint16_t
  // Element type of the ::DLTensor::shape array.
  using ShapeType =
      std::remove_reference<decltype(::DLTensor::shape[0])>::type;  // int64_t

  // lanes is only used in CPU to enable vectorization
  explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1);

  // Read-only view of the wrapped DLTensor.
  operator const ::DLTensor&() const { return t_; }

  // Mutable view of the wrapped DLTensor.
  operator ::DLTensor&() { return t_; }

 private:
  ::DLTensor t_;

  // ::DLTensor::shape is a plain int64_t pointer. Keeping the extents in this
  // fixed-size member lets a DLPackTensor be initialized without any heap
  // allocation; t_.shape points here after construction.
  ShapeType shape_[9];
};
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/dlpack_tensor_test.cc
0 → 100644
浏览文件 @
b5c44fd4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/dlpack_tensor.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <vector>
namespace
paddle
{
namespace
framework
{
namespace {  // NOLINT

// Compile-time reference implementation of the expected DLPack type code for
// T, used by the checks below. Priority: float (including platform::float16),
// then unsigned, then signed integral; anything else yields uint8_t(-1).
template <typename T>
constexpr uint8_t GetDLDataTypeCode() {
  return (std::is_floating_point<T>::value ||
          std::is_same<platform::float16, T>::value)
             ? static_cast<uint8_t>(kDLFloat)
             : std::is_unsigned<T>::value
                   ? static_cast<uint8_t>(kDLUInt)
                   : std::is_integral<T>::value
                         ? static_cast<uint8_t>(kDLInt)
                         : static_cast<uint8_t>(-1);
}

}  // NOLINT
// Allocates a 4x5x6x7 tensor of T on `place`, wraps it in a DLPackTensor with
// the requested `lanes`, and checks every field of the resulting ::DLTensor:
// data pointer, device context, rank, shape, strides, byte offset and dtype.
template <typename T>
void TestMain(const platform::Place &place, uint16_t lanes) {
  DDim dims{4, 5, 6, 7};
  Tensor tensor;
  tensor.Resize(dims);
  void *buffer = tensor.mutable_data<T>(place);

  DLPackTensor wrapper(tensor, lanes);
  ::DLTensor &view = wrapper;  // exercises the implicit conversion operator

  // The view must alias the tensor's buffer, not copy it.
  CHECK_EQ(buffer, view.data);

  // Device context must mirror the place the tensor lives on.
  if (platform::is_cpu_place(place)) {
    CHECK_EQ(kDLCPU, view.ctx.device_type);
    CHECK_EQ(0, view.ctx.device_id);
  } else if (platform::is_gpu_place(place)) {
    CHECK_EQ(kDLGPU, view.ctx.device_type);
    CHECK_EQ(boost::get<platform::CUDAPlace>(place).device,
             view.ctx.device_id);
  } else if (platform::is_cuda_pinned_place(place)) {
    CHECK_EQ(kDLCPUPinned, view.ctx.device_type);
    CHECK_EQ(0, view.ctx.device_id);
  } else {
    // Unknown place kind: fail unconditionally.
    CHECK_EQ(false, true);
  }

  // Rank and per-dimension extents.
  CHECK_EQ(dims.size(), view.ndim);
  for (auto axis = 0; axis < dims.size(); ++axis) {
    CHECK_EQ(dims[axis], view.shape[axis]);
  }

  // Layout: no explicit strides and no offset into the buffer.
  CHECK_EQ(view.strides == nullptr, true);
  CHECK_EQ(static_cast<uint64_t>(0), view.byte_offset);

  // Element type description.
  CHECK_EQ(lanes, view.dtype.lanes);
  CHECK_EQ(sizeof(T) * 8, view.dtype.bits);
  CHECK_EQ(GetDLDataTypeCode<T>(), view.dtype.code);
}
// Runs TestMain<T> for every available place combined with lanes 1 and 2.
//
// CPU is always covered. CUDA builds additionally cover GPU 0, pinned memory
// and, when more than one device is reported, GPU 1.
template <typename T>
void TestMainLoop() {
  std::vector<platform::Place> places{platform::CPUPlace()};
#ifdef PADDLE_WITH_CUDA
  places.emplace_back(platform::CUDAPlace(0));
  places.emplace_back(platform::CUDAPinnedPlace());
  if (platform::GetCUDADeviceCount() > 1) {
    places.emplace_back(platform::CUDAPlace(1));
  }
#endif

  std::vector<uint16_t> lane_counts{1, 2};
  for (auto &current_place : places) {
    for (auto &lane_count : lane_counts) {
      TestMain<T>(current_place, lane_count);
    }
  }
}
// Declares one gtest case named test_<type> in the `dlpack` suite; the case
// simply runs TestMainLoop<type>().
#define PADDLE_DLPACK_TEST(type) \
TEST(dlpack, test_##type) { TestMainLoop<type>(); }
// The macro pastes `type` into an identifier (test_##type), so the qualified
// name platform::float16 needs a single-token alias.
using
float16
=
platform
::
float16
;
// One test case per element type.
PADDLE_DLPACK_TEST
(
float16
);
PADDLE_DLPACK_TEST
(
float
);
PADDLE_DLPACK_TEST
(
double
);
PADDLE_DLPACK_TEST
(
int
);
PADDLE_DLPACK_TEST
(
int64_t
);
PADDLE_DLPACK_TEST
(
bool
);
PADDLE_DLPACK_TEST
(
size_t
);
PADDLE_DLPACK_TEST
(
int16_t
);
PADDLE_DLPACK_TEST
(
uint8_t
);
PADDLE_DLPACK_TEST
(
int8_t
);
#undef PADDLE_DLPACK_TEST
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/eigen.h
浏览文件 @
b5c44fd4
...
...
@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// logging.h and windows.h conflict
#define GLOG_NO_ABBREVIATED_SEVERITIES
// solve static linking error in windows
// https://github.com/google/glog/issues/301
#define GOOGLE_GLOG_DLL_DECL
#include "paddle/fluid/framework/tensor.h"
#include "unsupported/Eigen/CXX11/Tensor"
...
...
paddle/fluid/framework/executor.cc
浏览文件 @
b5c44fd4
...
...
@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
b5c44fd4
...
...
@@ -83,6 +83,7 @@ void NaiveExecutor::Run() {
for
(
auto
&
op
:
ops_
)
{
VLOG
(
3
)
<<
std
::
this_thread
::
get_id
()
<<
" run "
<<
op
->
Type
()
<<
" on scope "
<<
scope_
;
op
->
SetIsCalledByExecutor
(
false
);
op
->
Run
(
*
scope_
,
place_
);
}
}
...
...
paddle/fluid/framework/op_desc.cc
浏览文件 @
b5c44fd4
...
...
@@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
this
->
attrs_
[
name
]
=
std
::
vector
<
int
>
();
break
;
}
case
proto
::
AttrType
::
LONGS
:
{
VLOG
(
110
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from LONGS to LONGS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
int64_t
>
();
break
;
}
case
proto
::
AttrType
::
FLOATS
:
{
VLOG
(
110
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to FLOATS"
;
...
...
paddle/fluid/framework/op_registry.h
浏览文件 @
b5c44fd4
...
...
@@ -23,11 +23,6 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_set>
#if defined(_WIN32)
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#endif
#include "glog/logging.h" // For VLOG()
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/details/op_registry.h"
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
b5c44fd4
...
...
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include <gflags/gflags.h>
#include <glog/logging.h>
...
...
@@ -24,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -35,11 +34,6 @@ DEFINE_bool(check_nan_inf, false,
namespace
paddle
{
namespace
framework
{
// Combine two hash values to a single hash.
inline
size_t
CombineHash
(
size_t
seed
,
size_t
a
)
{
return
(
seed
^
a
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
std
::
vector
<
std
::
tuple
<
platform
::
Place
,
LibraryType
>>
kKernelPriority
=
{
std
::
make_tuple
(
platform
::
CUDAPlace
(
0
),
LibraryType
::
kCUDNN
),
std
::
make_tuple
(
platform
::
CUDAPlace
(
0
),
LibraryType
::
kPlain
),
...
...
@@ -799,17 +793,6 @@ void OperatorWithKernel::TransferInplaceVarsBack(
Scope
*
OperatorWithKernel
::
TryTransferData
(
const
Scope
&
scope
,
const
OpKernelType
&
expected_kernel_key
,
std
::
vector
<
std
::
string
>*
transfered_inplace_vars
)
const
{
// In the inference scenerio, the scopes will be reused across the batches, so
// the `new_scope` here will result in GPU memroy explosion over the running of
// operators.
// We use a thread_local cache to fix that issue, the key in the cache is the
// combination of the `scope` argument, from_kernel_type, target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if some changes
// on this logic for this macro might not tested on the other scenerios.
#ifdef PADDLE_ON_INFERENCE
thread_local
std
::
unordered_map
<
size_t
,
Scope
*>
infer_transfer_scope_cache
;
#endif
Scope
*
new_scope
=
nullptr
;
for
(
auto
&
var_name_item
:
Inputs
())
{
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
...
...
@@ -840,23 +823,23 @@ Scope* OperatorWithKernel::TryTransferData(
VLOG
(
30
)
<<
"Transform Variable "
<<
var_name
<<
" from "
<<
kernel_type_for_var
<<
" to "
<<
expected_kernel_key
;
#ifdef PADDLE_ON_INFERENCE
size_t
infer_cache_key
=
CombineHash
(
OpKernelType
::
Hash
()(
kernel_type_for_var
),
OpKernelType
::
Hash
()(
expected_kernel_key
));
infer_cache_key
=
CombineHash
(
infer_cache_key
,
std
::
hash
<
const
Scope
*>
()(
&
scope
));
auto
it
=
infer_transfer_scope_cache
.
find
(
infer_cache_key
);
if
(
it
!=
infer_transfer_scope_cache
.
end
())
{
new_scope
=
infer_transfer_scope_cache
[
infer_cache_key
];
}
else
{
new_scope
=
&
scope
.
NewScope
();
infer_transfer_scope_cache
[
infer_cache_key
]
=
new_scope
;
}
#endif
if
(
new_scope
==
nullptr
)
{
// In the inference scenario, the scopes will be reused across the
// batches, so the `new_scope` here will result in GPU memory explosion
// over the running of operators.
// We use a thread_local cache to fix that issue; the key in the cache is
// the combination of the `scope` argument, from_kernel_type, and
// target_kernel_type.
// Have a discussion with @Superjomn or the inference developers before
// changing this logic, since changes might not be tested in the other
// scenarios.
// If this op is not called by an Executor or ParallelExecutor, it should
// be called by a NaiveExecutor; the NaiveExecutor caches the scopes and
// variables, so it behaves quite differently.
if
(
!
run_by_executor_
)
{
new_scope
=
TryCreateTransferScope
(
kernel_type_for_var
,
expected_kernel_key
,
&
scope
);
}
if
(
!
new_scope
)
{
new_scope
=
&
scope
.
NewScope
();
}
...
...
paddle/fluid/framework/operator.h
浏览文件 @
b5c44fd4
...
...
@@ -20,8 +20,6 @@ limitations under the License. */
#include <tuple>
#include <unordered_map>
#include <vector>
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "glog/logging.h" // For VLOG
#include "paddle/fluid/framework/attribute.h"
...
...
@@ -100,6 +98,7 @@ class OperatorBase {
const
std
::
string
&
Type
()
const
{
return
type_
;
}
bool
HasAttr
(
const
std
::
string
&
name
)
const
{
return
attrs_
.
count
(
name
);
}
template
<
typename
T
>
inline
const
T
&
Attr
(
const
std
::
string
&
name
)
const
{
PADDLE_ENFORCE
(
attrs_
.
count
(
name
)
!=
0
,
"%s should be in AttributeMap"
,
...
...
@@ -128,6 +127,8 @@ class OperatorBase {
//! Get all outputs variable names
virtual
std
::
vector
<
std
::
string
>
OutputVars
(
bool
has_intermediate
)
const
;
void
SetIsCalledByExecutor
(
bool
x
)
{
run_by_executor_
=
x
;
}
protected:
std
::
string
type_
;
// NOTE: in case of OpGrad, inputs_ contains:
...
...
@@ -140,6 +141,8 @@ class OperatorBase {
// IG (Inputs Gradients)
VariableNameMap
outputs_
;
AttributeMap
attrs_
;
// Whether this operator executes in an Executor.
bool
run_by_executor_
{
true
};
private:
void
GenerateTemporaryNames
();
...
...
paddle/fluid/framework/transfer_scope_cache.cc
0 → 100644
浏览文件 @
b5c44fd4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/transfer_scope_cache.h"
namespace
paddle
{
namespace
framework
{
std
::
unordered_map
<
size_t
,
Scope
*>&
global_transfer_data_cache
()
{
thread_local
auto
*
x
=
new
std
::
unordered_map
<
size_t
,
Scope
*>
;
return
*
x
;
}
std
::
unordered_set
<
Scope
*>&
global_transfer_scope_cache
()
{
thread_local
auto
*
x
=
new
std
::
unordered_set
<
Scope
*>
;
return
*
x
;
}
// Looks up (or creates) the transfer scope associated with the triple
// (type0, type1, scope) in the calling thread's caches.
//
// The cache key is the CombineHash of the two kernel-type hashes, combined
// again with the hash of the `scope` pointer. On a miss, a new child scope is
// created through scope->NewScope() and stored under that key. In both cases
// the resulting pointer is inserted into global_transfer_scope_cache() and
// returned.
Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
                              const Scope* scope) {
  const size_t kernel_pair_hash =
      CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1));
  const size_t cache_key =
      CombineHash(kernel_pair_hash, std::hash<const Scope*>()(scope));

  auto& data_cache = global_transfer_data_cache();
  Scope* result = nullptr;
  auto found = data_cache.find(cache_key);
  if (found == data_cache.end()) {
    // Miss: create a child scope and remember it under this key.
    result = &scope->NewScope();
    data_cache[cache_key] = result;
  } else {
    // Hit: reuse the scope created earlier for the same key.
    result = found->second;
  }

  global_transfer_scope_cache().insert(result);
  return result;
}
// Drops `scope` and every scope returned by scope->kids() from the calling
// thread's transfer-scope set, then removes every entry of the transfer data
// cache whose stored pointer equals `scope`.
//
// NOTE(review): data-cache entries whose value is one of the kids are left in
// place; only entries pointing at `scope` itself are erased. Confirm that this
// is intended.
void RemoveKidsFromTransferScopeCache(Scope* scope) {
  auto& scope_cache = global_transfer_scope_cache();
  // Erasing by key is a no-op when the pointer is not in the set.
  scope_cache.erase(scope);
  for (auto* kid : scope->kids()) {
    scope_cache.erase(kid);
  }

  // remove global transfer data cache
  auto& data_cache = global_transfer_data_cache();
  auto entry = data_cache.begin();
  while (entry != data_cache.end()) {
    if (entry->second == scope) {
      // erase() returns the iterator following the removed element.
      entry = data_cache.erase(entry);
    } else {
      ++entry;
    }
  }
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/transfer_scope_cache.h
0 → 100644
浏览文件 @
b5c44fd4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <thread> // NOLINT
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/scope.h"
namespace
paddle
{
namespace
framework
{
std
::
unordered_map
<
size_t
,
Scope
*>&
global_transfer_data_cache
();
std
::
unordered_set
<
Scope
*>&
global_transfer_scope_cache
();
// Combine two hash values to a single hash.
// Computes (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2) in size_t
// arithmetic.
static size_t CombineHash(size_t seed, size_t a) {
  const size_t mixed = seed ^ a;
  const size_t shifted_up = seed << 6;
  const size_t shifted_down = seed >> 2;
  return mixed + 0x9e3779b9 + shifted_up + shifted_down;
}
Scope
*
TryCreateTransferScope
(
OpKernelType
type0
,
OpKernelType
type1
,
const
Scope
*
scope
);
void
RemoveKidsFromTransferScopeCache
(
Scope
*
scope
);
}
// namespace framework
}
// namespace paddle
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -4,6 +4,7 @@ endif()
# analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
add_subdirectory
(
analysis
)
add_subdirectory
(
utils
)
if
(
TENSORRT_FOUND
)
add_subdirectory
(
tensorrt
)
endif
()
...
...
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -7,16 +7,17 @@ set(analysis_deps # analysis_deps can be extended accross the project
add_subdirectory
(
ir_passes
)
add_subdirectory
(
passes
)
cc_library
(
ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass
${
INFER_IR_PASSES
}
)
cc_library
(
analysis_helper SRCS helper.cc DEPS framework_proto proto_desc graph paddle_fluid_api
)
cc_library
(
ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass
${
INFER_IR_PASSES
}
analysis_helper
)
cc_library
(
argument SRCS argument.cc DEPS scope proto_desc
)
cc_library
(
analysis_pass SRCS analysis_pass.cc DEPS proto_desc
)
cc_library
(
analysis SRCS
analyzer.cc
helper.cc
analysis_pass
DEPS
${
analysis_deps
}
DEPS
${
analysis_deps
}
analysis_helper
)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
...
...
@@ -34,4 +35,4 @@ function(inference_analysis_test TARGET)
endif
()
endfunction
(
inference_analysis_test
)
inference_analysis_test
(
test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api
)
inference_analysis_test
(
test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS
reset_tensor_array
paddle_inference_api
)
paddle/fluid/inference/analysis/analyzer_tester.cc
浏览文件 @
b5c44fd4
...
...
@@ -30,6 +30,7 @@ TEST(Analyzer, analysis_without_tensorrt) {
Argument
argument
;
argument
.
SetModelDir
(
FLAGS_inference_model_dir
);
argument
.
SetIrAnalysisPasses
({
"infer_clean_graph_pass"
});
argument
.
SetUseGPU
(
false
);
Analyzer
analyser
;
analyser
.
Run
(
&
argument
);
...
...
@@ -41,6 +42,7 @@ TEST(Analyzer, analysis_with_tensorrt) {
argument
.
SetTensorRtWorkspaceSize
(
1
<<
20
);
argument
.
SetModelDir
(
FLAGS_inference_model_dir
);
argument
.
SetIrAnalysisPasses
({
"infer_clean_graph_pass"
});
argument
.
SetUseGPU
(
false
);
Analyzer
analyser
;
analyser
.
Run
(
&
argument
);
...
...
paddle/fluid/inference/analysis/argument.h
浏览文件 @
b5c44fd4
...
...
@@ -116,6 +116,7 @@ struct Argument {
std
::
vector
<
std
::
string
>
);
DECL_ARGUMENT_FIELD
(
use_gpu
,
UseGPU
,
bool
);
DECL_ARGUMENT_FIELD
(
gpu_device_id
,
GPUDeviceId
,
int
);
DECL_ARGUMENT_FIELD
(
use_tensorrt
,
UseTensorRT
,
bool
);
DECL_ARGUMENT_FIELD
(
tensorrt_node_teller
,
TensorRtNodeTeller
,
std
::
function
<
bool
(
const
framework
::
ir
::
Node
*
)
>
);
...
...
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -4,4 +4,6 @@ set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL
""
)
set
(
pass_file
${
PADDLE_BINARY_DIR
}
/paddle/fluid/inference/api/paddle_inference_pass.h
)
file
(
APPEND
${
pass_file
}
"USE_PASS(tensorrt_subgraph_pass);
\n
"
)
set
(
INFER_IR_PASSES
${
INFER_IR_PASSES
}
tensorrt_subgraph_pass CACHE INTERNAL
""
)
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
浏览文件 @
b5c44fd4
...
...
@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// it is either an OP's input or an OP's output.
auto
&
subgraph_nodes
=
*
Agent
(
node
).
subgraph
();
for
(
size_t
index
=
0
;
index
<
block_desc
.
OpSize
();
index
++
)
{
for
(
size_t
index
=
0
;
index
<
block_desc
.
OpSize
();
++
index
)
{
framework
::
proto
::
OpDesc
*
op
=
block_desc
.
Op
(
index
)
->
Proto
();
auto
correspond_node
=
subgraph_nodes
[
index
];
PADDLE_ENFORCE_EQ
(
correspond_node
->
Name
(),
op
->
type
());
...
...
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
浏览文件 @
b5c44fd4
...
...
@@ -45,7 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
std
::
unordered_set
<
std
::
string
>
teller_set
(
{
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
,
"sigmoid"
,
"depthwise_conv2d"
,
"batch_norm"
,
"concat"
,
"tanh"
,
"pad"
,
"elementwise_add"
,
"dropout"
,
"split"
,
"prelu"
,
"conv2d_transpose"
});
"elementwise_add"
,
"elementwise_mul"
,
"dropout"
,
"split"
,
"prelu"
,
"conv2d_transpose"
,
"leaky_relu"
});
if
(
!
node
->
IsOp
())
return
false
;
if
(
teller_set
.
count
(
node
->
Op
()
->
Type
()))
{
...
...
paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
浏览文件 @
b5c44fd4
...
...
@@ -30,15 +30,28 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
if
(
!
argument
->
scope_valid
())
{
argument
->
SetScope
(
new
framework
::
Scope
);
}
PADDLE_ENFORCE
(
argument
->
use_gpu_valid
());
// The load program should run on the same device as the inference program,
// so that the parameters are on the same device; otherwise they will keep
// being copied between different devices.
platform
::
Place
place
;
if
(
argument
->
use_gpu
())
{
PADDLE_ENFORCE
(
argument
->
gpu_device_id_valid
());
place
=
platform
::
CUDAPlace
(
argument
->
gpu_device_id
());
}
else
{
place
=
platform
::
CPUPlace
();
}
if
(
argument
->
model_dir_valid
())
{
auto
program
=
LoadModel
(
argument
->
model_dir
(),
argument
->
scope_ptr
());
auto
program
=
LoadModel
(
argument
->
model_dir
(),
argument
->
scope_ptr
(),
place
);
argument
->
SetMainProgram
(
program
.
release
());
}
else
if
(
argument
->
model_program_path_valid
()
&&
argument
->
model_params_path_valid
())
{
auto
program
=
LoadModel
(
argument
->
model_program_path
(),
argument
->
model_params_path
(),
argument
->
scope_ptr
());
argument
->
scope_ptr
()
,
place
);
argument
->
SetMainProgram
(
program
.
release
());
}
else
{
PADDLE_THROW
(
...
...
@@ -52,16 +65,15 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
}
std
::
unique_ptr
<
framework
::
ProgramDesc
>
IrGraphBuildPass
::
LoadModel
(
const
std
::
string
&
path
,
framework
::
Scope
*
scope
)
{
platform
::
CPUPlace
place
;
const
std
::
string
&
path
,
framework
::
Scope
*
scope
,
const
platform
::
Place
&
place
)
{
framework
::
Executor
exe
(
place
);
return
Load
(
&
exe
,
scope
,
path
);
}
std
::
unique_ptr
<
framework
::
ProgramDesc
>
IrGraphBuildPass
::
LoadModel
(
const
std
::
string
&
program_path
,
const
std
::
string
&
params_path
,
framework
::
Scope
*
scope
)
{
platform
::
CPUPlace
place
;
framework
::
Scope
*
scope
,
const
platform
::
Place
&
place
)
{
framework
::
Executor
exe
(
place
);
return
Load
(
&
exe
,
scope
,
program_path
,
params_path
);
}
...
...
paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
浏览文件 @
b5c44fd4
...
...
@@ -17,6 +17,7 @@
#include <string>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
inference
{
...
...
@@ -32,11 +33,12 @@ class IrGraphBuildPass : public AnalysisPass {
std
::
string
repr
()
const
override
;
private:
std
::
unique_ptr
<
framework
::
ProgramDesc
>
LoadModel
(
const
std
::
string
&
path
,
framework
::
Scope
*
scope
);
std
::
unique_ptr
<
framework
::
ProgramDesc
>
LoadModel
(
const
std
::
string
&
path
,
framework
::
Scope
*
scope
,
const
platform
::
Place
&
place
);
std
::
unique_ptr
<
framework
::
ProgramDesc
>
LoadModel
(
const
std
::
string
&
program_path
,
const
std
::
string
&
params_path
,
framework
::
Scope
*
scope
);
framework
::
Scope
*
scope
,
const
platform
::
Place
&
place
);
std
::
string
model_binary_str_
;
};
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -27,11 +27,10 @@ endif()
cc_library
(
reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope
)
cc_library
(
analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder
)
cc_library
(
paddle_pass_builder SRCS paddle_pass_builder.cc
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder
)
cc_library
(
zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api
)
cc_library
(
zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager
)
cc_library
(
zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce
)
cc_library
(
zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor
)
cc_test
(
test_paddle_inference_api
SRCS api_tester.cc
...
...
paddle/fluid/inference/api/analysis_config.cc
浏览文件 @
b5c44fd4
...
...
@@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
prog_file
=
other
.
prog_file
;
param_file
=
other
.
param_file
;
specify_input_name
=
other
.
specify_input_name
;
cpu_math_library_num_threads_
=
other
.
cpu_math_library_num_threads_
;
// fields from this.
enable_ir_optim
=
other
.
enable_ir_optim
;
use_feed_fetch_ops
=
other
.
use_feed_fetch_ops
;
...
...
@@ -72,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
prog_file
=
other
.
prog_file
;
param_file
=
other
.
param_file
;
specify_input_name
=
other
.
specify_input_name
;
cpu_math_library_num_threads_
=
other
.
cpu_math_library_num_threads_
;
// fields from this.
enable_ir_optim
=
other
.
enable_ir_optim
;
use_feed_fetch_ops
=
other
.
use_feed_fetch_ops
;
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
b5c44fd4
...
...
@@ -35,7 +35,6 @@
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
profile
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
...
...
@@ -67,7 +66,7 @@ bool AnalysisPredictor::Init(
#endif
// no matter with or without MKLDNN
paddle
::
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
paddle
::
platform
::
SetNumThreads
(
config_
.
cpu_math_library_num_threads
()
);
if
(
!
PrepareScope
(
parent_scope
))
{
return
false
;
...
...
@@ -160,6 +159,14 @@ bool AnalysisPredictor::PrepareExecutor() {
return
true
;
}
// Forwards `tid` to platform::set_cur_thread_id when the build defines
// PADDLE_WITH_MKLDNN; otherwise only logs an error and does nothing else.
void
AnalysisPredictor
::
SetMkldnnThreadID
(
int
tid
)
{
#ifdef PADDLE_WITH_MKLDNN
platform
::
set_cur_thread_id
(
tid
);
#else
// Built without MKLDNN: the request cannot be honored, so report it.
LOG
(
ERROR
)
<<
"Please compile with MKLDNN first to use MKLDNN"
;
#endif
}
bool
AnalysisPredictor
::
Run
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
output_data
,
int
batch_size
)
{
...
...
@@ -285,6 +292,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
status_program_optimized_
=
true
;
argument_
.
SetUseGPU
(
config_
.
use_gpu
);
argument_
.
SetGPUDeviceId
(
config_
.
device
);
// Analyze inference_program
if
(
!
config_
.
model_dir
.
empty
())
{
argument_
.
SetModelDir
(
config_
.
model_dir
);
...
...
@@ -491,8 +499,7 @@ bool AnalysisPredictor::LoadParameters() {
}
// Use NaiveExecutor to Load parameters.
platform
::
CPUPlace
place
;
framework
::
NaiveExecutor
e
(
place
);
framework
::
NaiveExecutor
e
(
place_
);
e
.
Prepare
(
scope_
.
get
(),
*
load_program
,
0
,
false
);
e
.
Run
();
VLOG
(
3
)
<<
"get "
<<
scope_
->
LocalVarNames
().
size
()
<<
" vars after load"
;
...
...
@@ -551,4 +558,5 @@ USE_TRT_CONVERTER(pad);
USE_TRT_CONVERTER
(
split
);
USE_TRT_CONVERTER
(
prelu
);
USE_TRT_CONVERTER
(
conv2d_transpose
);
USE_TRT_CONVERTER
(
leaky_relu
);
#endif
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
b5c44fd4
...
...
@@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
framework
::
Scope
*
scope
()
{
return
scope_
.
get
();
}
framework
::
ProgramDesc
&
program
()
{
return
*
inference_program_
;
}
void
SetMkldnnThreadID
(
int
tid
);
protected:
bool
PrepareProgram
(
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
);
bool
PrepareScope
(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
);
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
b5c44fd4
...
...
@@ -28,7 +28,6 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool
(
profile
,
false
,
"Turn on profiler for fluid"
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
{
...
...
@@ -76,7 +75,7 @@ bool NativePaddlePredictor::Init(
#endif
// no matter with or without MKLDNN
paddle
::
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
paddle
::
platform
::
SetNumThreads
(
config_
.
cpu_math_library_num_threads
()
);
if
(
config_
.
use_gpu
)
{
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
...
...
paddle/fluid/inference/api/api_impl.h
浏览文件 @
b5c44fd4
...
...
@@ -14,12 +14,6 @@ limitations under the License. */
#pragma once
// logging.h and windows.h conflict
#define GLOG_NO_ABBREVIATED_SEVERITIES
// solve static linking error in windows
// https://github.com/google/glog/issues/301
#define GOOGLE_GLOG_DLL_DECL
#include <glog/logging.h>
#include <map>
#include <memory>
...
...
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -46,8 +46,6 @@ if(WITH_GPU)
endif
()
endif
(
NOT WIN32
)
endif
()
include_directories
(
"D:/Paddle/"
)
include_directories
(
"
${
PADDLE_LIB
}
"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/protobuf/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/glog/include"
)
...
...
paddle/fluid/inference/api/paddle_analysis_config.h
浏览文件 @
b5c44fd4
...
...
@@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
int
max_batch_size
=
1
);
bool
use_tensorrt
()
const
{
return
use_tensorrt_
;
}
void
EnableMKLDNN
();
// NOTE this is just for internal development, please not use it.
// NOT stable yet.
void
EnableMKLDNN
();
bool
use_mkldnn
()
const
{
return
use_mkldnn_
;
}
friend
class
::
paddle
::
AnalysisPredictor
;
...
...
paddle/fluid/inference/api/paddle_api.h
浏览文件 @
b5c44fd4
...
...
@@ -186,6 +186,19 @@ struct NativeConfig : public PaddlePredictor::Config {
// Specify the variable's name of each input if input tensors don't follow the
// `feeds` and `fetches` of the phase `save_inference_model`.
bool
specify_input_name
{
false
};
// Set and get the number of cpu math library threads.
void
SetCpuMathLibraryNumThreads
(
int
cpu_math_library_num_threads
)
{
cpu_math_library_num_threads_
=
cpu_math_library_num_threads
;
}
int
cpu_math_library_num_threads
()
const
{
return
cpu_math_library_num_threads_
;
}
protected:
// number of cpu math library (such as MKL, OpenBlas) threads for each
// instance.
int
cpu_math_library_num_threads_
{
1
};
};
// A factory to help create different predictors.
...
...
paddle/fluid/inference/api/paddle_pass_builder.h
浏览文件 @
b5c44fd4
...
...
@@ -116,8 +116,12 @@ class CpuPassStrategy : public PassStrategy {
class
GpuPassStrategy
:
public
PassStrategy
{
public:
GpuPassStrategy
()
:
PassStrategy
({})
{
// TODO(NHZlX) Problem with data synchronization between GPU and CPU.
// When running in GPU mode, the parameters are all on GPU, but the
// operations of "conv_bn_fuse_pass" are on CPU.
passes_
.
assign
({
"infer_clean_graph_pass"
,
"conv_bn_fuse_pass"
,
"infer_clean_graph_pass"
,
// "infer_clean_graph_pass", "conv_bn_fuse_pass",
});
}
...
...
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
浏览文件 @
b5c44fd4
# Add TRT tests
nv_library
(
tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc split_op.cc p
relu_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc split_op.cc prelu_op.cc leaky_
relu_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry
)
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
...
...
@@ -18,9 +18,10 @@ nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
nv_test
(
test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine conv_op conv_transpose_op SERIAL
)
nv_test
(
test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine pool_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine pool_op
tensorrt_plugin
SERIAL
)
nv_test
(
test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine elementwise_add_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine tensorrt_plugin
elementwise_add_op elementwise_mul_op SERIAL
)
nv_test
(
test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine softmax_op SERIAL
)
nv_test
(
test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
...
...
@@ -37,3 +38,5 @@ nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
nv_test
(
test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine tensorrt_plugin
prelu_op SERIAL
)
nv_test
(
test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OPERATOR_DEPS
}
tensorrt_engine activation_op SERIAL
)
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
浏览文件 @
b5c44fd4
...
...
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
...
...
@@ -13,11 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// Returns true only when both Dims report the same nbDims and every extent
// d[i] for i in [0, nbDims) is equal; returns false otherwise.
static bool CheckDims(const nvinfer1::Dims& dims_x,
                      const nvinfer1::Dims& dims_y) {
  if (dims_x.nbDims != dims_y.nbDims) return false;

  // Advance while the extents agree; stopping early means a mismatch.
  int axis = 0;
  while (axis < dims_x.nbDims && dims_x.d[axis] == dims_y.d[axis]) {
    ++axis;
  }
  return !(axis < dims_x.nbDims);
}
class
ElementwiseWeightOpConverter
:
public
OpConverter
{
public:
ElementwiseWeightOpConverter
()
{}
...
...
@@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
VLOG
(
3
)
<<
"
convert a fluid elementwise op to tensorrt
IScaleLayer"
;
VLOG
(
3
)
<<
"
Convert a fluid elementwise op to TensorRT
IScaleLayer"
;
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Y"
).
size
(),
1
);
// Y is a weight
...
...
@@ -106,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter {
ElementwiseTensorOpConverter
()
{}
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
auto
op_pair
=
ops
.
find
(
op_type_
);
PADDLE_ENFORCE
(
op_pair
!=
ops
.
end
(),
"Wrong elementwise op type!"
);
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
VLOG
(
3
)
<<
"convert a fluid elementwise op to tensorrt IScaleLayer"
;
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Y"
).
size
(),
1
);
// Y is a weight
...
...
@@ -120,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter {
nvinfer1
::
Dims
dims_x
=
X
->
getDimensions
();
nvinfer1
::
Dims
dims_y
=
Y
->
getDimensions
();
int
axis
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"axis"
));
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
if
(
CheckDims
(
dims_x
,
dims_y
))
{
// The two input tensor should have the same dims
PADDLE_ENFORCE
(
dims_x
.
nbDims
>=
3
);
if
(
dims_x
.
nbDims
==
dims_y
.
nbDims
)
{
for
(
int
i
=
0
;
i
<
dims_x
.
nbDims
;
i
++
)
{
if
(
dims_x
.
d
[
i
]
!=
dims_y
.
d
[
i
])
PADDLE_THROW
(
"TensorRT unsupported tensor shape for Elementwise op!"
);
}
}
else
{
PADDLE_THROW
(
"TensorRT unsupported tensor shape for Elementwise op!"
);
}
VLOG
(
3
)
<<
"Convert a fluid elementwise op to TensorRT IElementWiseLayer"
;
auto
op_pair
=
ops
.
find
(
op_type_
);
if
(
op_pair
==
ops
.
end
())
{
PADDLE_THROW
(
"Wrong elementwise op type!"
);
}
nvinfer1
::
IElementWiseLayer
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
ElementWise
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
X
),
*
const_cast
<
nvinfer1
::
ITensor
*>
(
Y
),
op_pair
->
second
);
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
layer
->
setName
((
"elementwise (Output: "
+
output_name
+
")"
).
c_str
());
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
}
else
{
VLOG
(
3
)
<<
"Convert a fluid elementwise op to TensorRT "
"ElementWisePluginLayer"
;
plugin
::
ElementWisePlugin
*
plugin
=
new
plugin
::
ElementWisePlugin
(
op_pair
->
second
,
dims_x
,
dims_y
,
axis
);
plugin
->
AddInput
(
X
);
plugin
->
AddInput
(
Y
);
nvinfer1
::
IPluginLayer
*
layer
=
engine_
->
AddPlugin
(
const_cast
<
nvinfer1
::
ITensor
*
const
*>
(
plugin
->
GetInputs
().
data
()),
2
,
reinterpret_cast
<
plugin
::
PluginTensorRT
*>
(
plugin
));
layer
->
setName
((
"elementwise (Output: "
+
output_name
+
")"
).
c_str
());
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
}
if
(
test_mode
)
{
// the test framework can not determine which is the
// output, so place the declaration inside.
engine_
->
DeclareOutput
(
output_name
);
...
...
paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
0 → 100644
浏览文件 @
b5c44fd4
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// Converter that lowers a fluid leaky_relu op onto TensorRT layers.
//
// The op is assembled from stock TensorRT layers using the identity
//   leaky_relu(x) = alpha * x + (1 - alpha) * relu(x)
// i.e. one Scale layer on the input, an Activation(kRELU) layer followed by a
// second Scale layer, and an ElementWise(kSUM) layer adding both branches.
class LeakyReluOpConverter : public OpConverter {
 public:
  // op:        description of the leaky_relu op to convert.
  // scope:     not read by this converter.
  // test_mode: when true, the op output is also declared as an engine output.
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    VLOG(4) << "convert fluid leaky_relu op to tensorrt layer";

    framework::OpDesc op_desc(op, nullptr);

    // The op must have exactly one "X" input and exactly one "Out" output.
    int x_count = op_desc.Input("X").size();
    PADDLE_ENFORCE(x_count == 1);
    auto* x_tensor = engine_->GetITensor(op_desc.Input("X")[0]);
    size_t out_count = op_desc.Output("Out").size();
    PADDLE_ENFORCE(out_count == 1);

    const float alpha = boost::get<float>(op_desc.GetAttr("alpha"));

    // Two-element CPU buffer holding {alpha, 1 - alpha}. The TensorRT weights
    // below point into this buffer, so ownership is handed to the engine's
    // weight_map further down to keep it alive.
    platform::CPUPlace cpu_place;
    std::unique_ptr<framework::LoDTensor> coeff_tensor(
        new framework::LoDTensor());
    coeff_tensor->Resize(framework::make_ddim({2}));
    float* coeffs = coeff_tensor->mutable_data<float>(cpu_place);
    coeffs[0] = alpha;
    coeffs[1] = 1.f - alpha;

    TensorRTEngine::Weight alpha_weight{nvinfer1::DataType::kFLOAT, coeffs, 1};
    // Shift and power are passed as empty weights to both Scale layers.
    TensorRTEngine::Weight no_shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
    TensorRTEngine::Weight no_power{nvinfer1::DataType::kFLOAT, nullptr, 0};

    // Branch 1: alpha * x
    auto* alpha_branch = TRT_ENGINE_ADD_LAYER(
        engine_, Scale, *x_tensor, nvinfer1::ScaleMode::kUNIFORM,
        no_shift.get(), alpha_weight.get(), no_power.get());
    PADDLE_ENFORCE(nullptr != alpha_branch);

    // Branch 2, first half: relu(x) = (x > 0) ? x : 0
    auto* relu = TRT_ENGINE_ADD_LAYER(engine_, Activation, *x_tensor,
                                      nvinfer1::ActivationType::kRELU);
    PADDLE_ENFORCE(nullptr != relu);

    // Branch 2, second half: (1 - alpha) * relu(x)
    TensorRTEngine::Weight one_minus_alpha{nvinfer1::DataType::kFLOAT,
                                           coeffs + 1, 1};
    auto* relu_branch = TRT_ENGINE_ADD_LAYER(
        engine_, Scale, *(relu->getOutput(0)), nvinfer1::ScaleMode::kUNIFORM,
        no_shift.get(), one_minus_alpha.get(), no_power.get());
    PADDLE_ENFORCE(nullptr != relu_branch);

    // Sum of both branches is the leaky_relu result.
    auto* sum_layer = TRT_ENGINE_ADD_LAYER(
        engine_, ElementWise, *(alpha_branch->getOutput(0)),
        *(relu_branch->getOutput(0)), nvinfer1::ElementWiseOperation::kSUM);
    PADDLE_ENFORCE(nullptr != sum_layer);

    const std::string output_name = op_desc.Output("Out")[0];

    // Park the coefficient tensor in the engine so its memory is not released
    // when this function returns; the key must not already be taken.
    const std::string coeff_key = output_name + "_alpha";
    PADDLE_ENFORCE(engine_->weight_map.find(coeff_key) ==
                   engine_->weight_map.end());
    engine_->weight_map[coeff_key] = std::move(coeff_tensor);

    // Publish the result tensor under the op's output name.
    auto* result = sum_layer->getOutput(0);
    result->setName(output_name.c_str());
    engine_->SetITensor(output_name, sum_layer->getOutput(0));
    if (test_mode) {
      engine_->DeclareOutput(output_name);
    }

    const std::string layer_name =
        "leaky_relu (Output: " + output_name + ")";
    sum_layer->setName(layer_name.c_str());
  }
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
REGISTER_TRT_OP_CONVERTER
(
leaky_relu
,
LeakyReluOpConverter
);
paddle/fluid/inference/tensorrt/convert/op_converter.h
浏览文件 @
b5c44fd4
...
...
@@ -61,7 +61,7 @@ class OpConverter {
// TODO(xingzhaolong): all mul, sub, div
// static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
// "sub", "div"};
static
std
::
unordered_set
<
std
::
string
>
add_weight_op_set
{
"add"
};
static
std
::
unordered_set
<
std
::
string
>
add_weight_op_set
{
"add"
,
"mul"
};
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Y"
).
size
(),
1UL
);
int
op_type_len
=
op_desc
.
Type
().
size
();
std
::
string
op_type
=
op_desc
.
Type
().
substr
(
op_type_len
-
3
,
op_type_len
);
...
...
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
浏览文件 @
b5c44fd4
...
...
@@ -13,25 +13,57 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// Adjusts the trailing padding so that floor-mode pooling over the padded
// input yields the ceil-mode output size.
//
// For each spatial axis (H, then W) the floor-mode and ceil-mode output sizes
// are computed from the input extent, kernel size, padding and stride. When
// they differ, the corresponding component of *post_pad is set to
// (stride - 1); otherwise it is left untouched.
//
// input_shape: dimensions of the input tensor; H and W are read from the last
//              two of its first `input_dims` entries.
// ksize, strides, paddings: per-axis {H, W} pooling parameters.
// pre_pad:     accepted but never read or written here.
// post_pad:    updated in place as described above.
// input_dims:  number of valid entries in input_shape.d.
void DealCeilMode(const nvinfer1::Dims &input_shape, std::vector<int> ksize,
                  std::vector<int> strides, std::vector<int> paddings,
                  nvinfer1::DimsHW *pre_pad, nvinfer1::DimsHW *post_pad,
                  int input_dims) {
  // Spatial extents, read up front before post_pad is modified.
  const int extent[2] = {input_shape.d[input_dims - 2],
                         input_shape.d[input_dims - 1]};

  // True when ceil-mode and floor-mode output sizes disagree on `axis`
  // (0 = height, 1 = width).
  auto ceil_differs = [&](int axis) {
    const int span = extent[axis] - ksize[axis] + 2 * paddings[axis];
    const int floor_size = span / strides[axis] + 1;
    const int ceil_size = (span + strides[axis] - 1) / strides[axis] + 1;
    return floor_size != ceil_size;
  };

  const bool pad_h = ceil_differs(0);
  const bool pad_w = ceil_differs(1);

  if (pad_h) {
    post_pad->h() = strides[0] - 1;
  }
  if (pad_w) {
    post_pad->w() = strides[1] - 1;
  }
}
/*
 * Pool2dOp, IPoolingLayer in TRT. This layer doesn't have weights.
*/
class
Pool2dOpConverter
:
public
OpConverter
{
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
3
)
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
40
)
<<
"convert a fluid pool2d op to tensorrt pool2d layer without bias"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
auto
*
input1
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"X"
)[
0
]);
auto
*
input1
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"X"
)[
0
]);
nvinfer1
::
Dims
input_shape
=
input1
->
getDimensions
();
int
input_dims
=
input_shape
.
nbDims
;
PADDLE_ENFORCE_EQ
(
input_dims
,
3UL
);
bool
global_pooling
=
boost
::
get
<
bool
>
(
op_desc
.
GetAttr
(
"global_pooling"
));
std
::
string
pool_type
=
...
...
@@ -44,23 +76,6 @@ class Pool2dOpConverter : public OpConverter {
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"paddings"
));
bool
ceil_mode
=
boost
::
get
<
bool
>
(
op_desc
.
GetAttr
(
"ceil_mode"
));
nvinfer1
::
Dims
input_shape
=
input1
->
getDimensions
();
int
nbDims
=
input_shape
.
nbDims
;
nvinfer1
::
DimsHW
nv_ksize
(
ksize
[
0
],
ksize
[
1
]);
nvinfer1
::
DimsHW
nv_strides
(
strides
[
0
],
strides
[
1
]);
nvinfer1
::
DimsHW
nv_paddings
(
paddings
[
0
],
paddings
[
1
]);
if
(
global_pooling
==
true
)
{
nv_ksize
.
d
[
0
]
=
input_shape
.
d
[
nbDims
-
2
];
nv_ksize
.
d
[
1
]
=
input_shape
.
d
[
nbDims
-
1
];
nv_strides
.
h
()
=
1
;
nv_strides
.
w
()
=
1
;
nv_paddings
.
h
()
=
0
;
nv_paddings
.
w
()
=
0
;
}
PADDLE_ENFORCE_EQ
(
input1
->
getDimensions
().
nbDims
,
3UL
);
nvinfer1
::
PoolingType
nv_pool_type
=
nvinfer1
::
PoolingType
::
kMAX
;
if
(
pool_type
==
"max"
)
{
nv_pool_type
=
nvinfer1
::
PoolingType
::
kMAX
;
...
...
@@ -70,42 +85,63 @@ class Pool2dOpConverter : public OpConverter {
PADDLE_THROW
(
"TensorRT unsupported pooling type!"
);
}
if
(
ceil_mode
)
{
nvinfer1
::
DimsHW
pre_pad
(
0
,
0
);
nvinfer1
::
DimsHW
post_pad
(
0
,
0
);
int
input_height
=
input_shape
.
d
[
nbDims
-
2
];
int
input_width
=
input_shape
.
d
[
nbDims
-
1
];
int
floor_h_output_size
=
(
input_height
-
ksize
[
0
]
+
2
*
paddings
[
0
])
/
strides
[
0
]
+
1
;
int
ceil_h_output_size
=
(
input_height
-
ksize
[
0
]
+
2
*
paddings
[
0
]
+
strides
[
0
]
-
1
)
/
strides
[
0
]
+
1
;
nvinfer1
::
DimsHW
nv_ksize
(
ksize
[
0
],
ksize
[
1
]);
nvinfer1
::
DimsHW
nv_strides
(
strides
[
0
],
strides
[
1
]);
nvinfer1
::
DimsHW
nv_paddings
(
paddings
[
0
],
paddings
[
1
]);
int
floor_w_output_size
=
(
input_width
-
ksize
[
1
]
+
2
*
paddings
[
1
])
/
strides
[
1
]
+
1
;
int
ceil_w_output_size
=
(
input_width
-
ksize
[
1
]
+
2
*
paddings
[
1
]
+
strides
[
1
]
-
1
)
/
strides
[
1
]
+
1
;
if
(
floor_h_output_size
!=
ceil_h_output_size
)
{
post_pad
.
h
()
=
strides
[
0
]
-
1
;
}
nvinfer1
::
ILayer
*
layer
=
nullptr
;
if
(
floor_w_output_size
!=
ceil_w_output_size
)
{
post_pad
.
w
()
=
strides
[
1
]
-
1
;
if
(
global_pooling
==
true
)
{
nv_ksize
.
d
[
0
]
=
input_shape
.
d
[
input_dims
-
2
];
nv_ksize
.
d
[
1
]
=
input_shape
.
d
[
input_dims
-
1
];
auto
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Pooling
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
input1
),
nv_pool_type
,
nv_ksize
);
PADDLE_ENFORCE_NOT_NULL
(
layer
,
"pool layer could not be created."
);
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
layer
->
setName
((
"pool2d (Output: "
+
output_name
+
")"
).
c_str
());
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
if
(
test_mode
)
{
engine_
->
DeclareOutput
(
output_name
);
}
return
;
}
auto
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Padding
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
input1
),
pre_pad
,
if
(
pool_type
==
"max"
)
{
nvinfer1
::
DimsHW
pre_pad
(
paddings
[
0
],
paddings
[
1
]);
nvinfer1
::
DimsHW
post_pad
(
paddings
[
0
],
paddings
[
1
]);
if
(
ceil_mode
)
{
// If ceil mode is true, we will pad the appropriate size to the input.
DealCeilMode
(
input_shape
,
ksize
,
strides
,
paddings
,
&
pre_pad
,
&
post_pad
,
input_dims
);
auto
*
pad_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Padding
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
input1
),
pre_pad
,
post_pad
);
input1
=
layer
->
getOutput
(
0
);
PADDLE_ENFORCE_NOT_NULL
(
pad_layer
,
"pad layer in poolOp converter could not be created."
);
input1
=
pad_layer
->
getOutput
(
0
);
}
auto
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Pooling
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
input1
),
auto
*
pool_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Pooling
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
input1
),
nv_pool_type
,
nv_ksize
);
PADDLE_ENFORCE_NOT_NULL
(
layer
,
"pool layer could not be created."
);
layer
->
setStride
(
nv_strides
);
layer
->
setPadding
(
nv_paddings
);
PADDLE_ENFORCE_NOT_NULL
(
pool_layer
,
"pool layer could not be created."
);
pool_layer
->
setStride
(
nv_strides
);
pool_layer
->
setPadding
(
nv_paddings
);
layer
=
pool_layer
;
}
else
{
// Average pooling needs to exclude the padding pixels from the average
// mean.
// It is not supported well by TRT, we use a plugin here.
std
::
vector
<
int
>
input_shape_v
;
for
(
int
i
=
0
;
i
<
input_dims
;
i
++
)
{
input_shape_v
.
push_back
(
input_shape
.
d
[
i
]);
}
plugin
::
AvgPoolPlugin
*
plugin
=
new
plugin
::
AvgPoolPlugin
(
ceil_mode
,
ksize
,
strides
,
paddings
,
input_shape_v
);
auto
*
avg_pool_layer
=
engine_
->
AddPlugin
(
&
input1
,
1
,
plugin
);
layer
=
avg_pool_layer
;
}
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
layer
->
setName
((
"pool2d (Output: "
+
output_name
+
")"
).
c_str
());
...
...
paddle/fluid/inference/tensorrt/convert/prelu_op.cc
浏览文件 @
b5c44fd4
...
...
@@ -54,7 +54,7 @@ class PReluOpConverter : public OpConverter {
TensorRTEngine
::
Weight
alpha_rt
(
nvinfer1
::
DataType
::
kFLOAT
,
static_cast
<
void
*>
(
alpha_data
),
alpha_tensor_device
->
numel
());
PReluPlugin
*
plugin
=
new
PReluPlugin
(
alpha_rt
,
mode
);
plugin
::
PReluPlugin
*
plugin
=
new
plugin
::
PReluPlugin
(
alpha_rt
,
mode
);
nvinfer1
::
IPluginLayer
*
layer
=
engine_
->
AddPlugin
(
&
input
,
input_num
,
plugin
);
// keep the alpha tensor alive so that its memory is not released
...
...
paddle/fluid/inference/tensorrt/convert/split_op.cc
浏览文件 @
b5c44fd4
...
...
@@ -19,9 +19,6 @@ namespace paddle {
namespace
inference
{
namespace
tensorrt
{
/*
* SplitOp.
*/
class
SplitOpConverter
:
public
OpConverter
{
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
...
...
@@ -40,17 +37,12 @@ class SplitOpConverter : public OpConverter {
int
axis
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"axis"
));
std
::
vector
<
int
>
output_lengths
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"sections"
));
// split on batch is not supported in TensorRT
PADDLE_ENFORCE
(
axis
!=
0
);
if
(
axis
<
0
)
{
axis
+=
input_dims
.
nbDims
;
}
else
{
axis
-=
1
;
}
axis
+=
(
axis
<
0
)
?
input_dims
.
nbDims
:
-
1
;
PADDLE_ENFORCE
(
output_lengths
.
size
()
==
output_num
);
//
SplitPlugin
*
plugin
=
new
SplitPlugin
(
axis
,
output_lengths
);
plugin
::
SplitPlugin
*
plugin
=
new
plugin
::
SplitPlugin
(
axis
,
output_lengths
);
nvinfer1
::
IPluginLayer
*
layer
=
engine_
->
AddPlugin
(
&
input
,
input_num
,
plugin
);
...
...
paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
浏览文件 @
b5c44fd4
...
...
@@ -20,13 +20,12 @@ namespace paddle {
namespace
inference
{
namespace
tensorrt
{
TEST
(
elementwise_op
,
add_weight
_test
)
{
TEST
(
elementwise_op
,
add_weight
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
"elementwise_add-Y"
});
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
10
,
parameters
,
scope
,
1
<<
15
);
validator
.
DeclInputVar
(
"elementwise_add-X"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
validator
.
DeclParamVar
(
"elementwise_add-Y"
,
nvinfer1
::
Dims3
(
10
,
1
,
1
));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator
.
DeclOutputVar
(
"elementwise_add-Out"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
// Prepare Op description
...
...
@@ -44,30 +43,65 @@ TEST(elementwise_op, add_weight_test) {
validator
.
Execute
(
8
);
}
TEST
(
elementwise_op
,
add_tensor_test
)
{
TEST
(
elementwise_op
,
native
)
{
for
(
std
::
string
type
:
{
"add"
,
"mul"
})
{
int
batch_size
=
8
;
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
8
,
parameters
,
scope
,
1
<<
15
);
validator
.
DeclInputVar
(
"elementwise_add-X"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
validator
.
DeclInputVar
(
"elementwise_add-Y"
,
nvinfer1
::
Dims3
(
10
,
3
,
3
));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator
.
DeclOutputVar
(
"elementwise_add-Out"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
TRTConvertValidation
validator
(
batch_size
,
parameters
,
scope
,
1
<<
15
);
validator
.
DeclInputVar
(
"elementwise_"
+
type
+
"-X"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
validator
.
DeclInputVar
(
"elementwise_"
+
type
+
"-Y"
,
nvinfer1
::
Dims3
(
10
,
3
,
3
));
validator
.
DeclOutputVar
(
"elementwise_"
+
type
+
"-Out"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"elementwise_add"
);
desc
.
SetInput
(
"X"
,
{
"elementwise_add
-X"
});
desc
.
SetInput
(
"Y"
,
{
"elementwise_add
-Y"
});
desc
.
SetOutput
(
"Out"
,
{
"elementwise_add
-Out"
});
desc
.
SetType
(
"elementwise_"
+
type
);
desc
.
SetInput
(
"X"
,
{
"elementwise_"
+
type
+
"
-X"
});
desc
.
SetInput
(
"Y"
,
{
"elementwise_"
+
type
+
"
-Y"
});
desc
.
SetOutput
(
"Out"
,
{
"elementwise_"
+
type
+
"
-Out"
});
// the default axis of elementwise op is -1
int
axis
=
-
1
;
desc
.
SetAttr
(
"axis"
,
axis
);
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
batch_size
);
}
}
validator
.
Execute
(
8
);
// Exercises elementwise_add and elementwise_mul with X declared as 10x3x3 and
// Y declared as 10x1x1 (both as inputs, not parameters), with axis = -1, and
// executes each op through TRTConvertValidation with a batch size of 8.
TEST(elementwise_op, plugin) {
  for (const char* kind : {"add", "mul"}) {
    const std::string op_type = std::string("elementwise_") + kind;
    const std::string x_name = op_type + "-X";
    const std::string y_name = op_type + "-Y";
    const std::string out_name = op_type + "-Out";

    int batch_size = 8;
    std::unordered_set<std::string> parameters;
    framework::Scope scope;
    TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);

    // Declare the variables the op reads and writes.
    validator.DeclInputVar(x_name, nvinfer1::DimsCHW(10, 3, 3));
    validator.DeclInputVar(y_name, nvinfer1::Dims3(10, 1, 1));
    validator.DeclOutputVar(out_name, nvinfer1::DimsCHW(10, 3, 3));

    // Describe the op under test.
    framework::OpDesc desc;
    desc.SetType(op_type);
    desc.SetInput("X", {x_name});
    desc.SetInput("Y", {y_name});
    desc.SetOutput("Out", {out_name});

    int axis = -1;
    desc.SetAttr("axis", axis);

    validator.SetOp(*desc.Proto());
    validator.Execute(batch_size);
  }
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
elementwise_add
);
USE_OP
(
elementwise_mul
);
paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
0 → 100644
浏览文件 @
b5c44fd4
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// Builds a single leaky_relu op (alpha = 0.1) with a 3x2x2 input and a 3x2x2
// output, then executes it through TRTConvertValidation with batch size 1.
TEST(leaky_relu_op, test_leaky_relu) {
  const std::string input_name = "leaky_relu_input";
  const std::string output_name = "leaky_relu_out";

  std::unordered_set<std::string> parameters;
  framework::Scope scope;
  TRTConvertValidation validator(10, parameters, scope, 1000);

  // Declare the variables the op reads and writes.
  validator.DeclInputVar(input_name, nvinfer1::DimsCHW(3, 2, 2));
  validator.DeclOutputVar(output_name, nvinfer1::DimsCHW(3, 2, 2));

  // Describe the op under test.
  framework::OpDesc desc;
  desc.SetType("leaky_relu");
  desc.SetInput("X", {input_name});
  desc.SetOutput("Out", {output_name});
  desc.SetAttr("alpha", 0.1f);

  validator.SetOp(*desc.Proto());
  validator.Execute(1);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
// USE_OP(leaky_relu);
USE_OP
(
leaky_relu
);
paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
浏览文件 @
b5c44fd4
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
...
...
paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
浏览文件 @
b5c44fd4
...
...
@@ -20,20 +20,21 @@ namespace paddle {
namespace
inference
{
namespace
tensorrt
{
void
test_pool2d
(
bool
global_pooling
,
bool
ceil_mode
)
{
void
test_pool2d
(
bool
global_pooling
,
bool
ceil_mode
,
std
::
string
pool_type
=
"max"
)
{
framework
::
Scope
scope
;
std
::
unordered_set
<
std
::
string
>
parameters
;
TRTConvertValidation
validator
(
5
,
parameters
,
scope
,
1
<<
15
);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
validator
.
DeclInputVar
(
"pool2d-X"
,
nvinfer1
::
Dims3
(
3
,
13
,
14
));
validator
.
DeclInputVar
(
"pool2d-X"
,
nvinfer1
::
Dims3
(
3
,
6
,
7
));
if
(
global_pooling
)
validator
.
DeclOutputVar
(
"pool2d-Out"
,
nvinfer1
::
Dims3
(
3
,
1
,
1
));
else
if
(
ceil_mode
)
validator
.
DeclOutputVar
(
"pool2d-Out"
,
nvinfer1
::
Dims3
(
3
,
6
,
7
));
validator
.
DeclOutputVar
(
"pool2d-Out"
,
nvinfer1
::
Dims3
(
3
,
3
,
4
));
else
validator
.
DeclOutputVar
(
"pool2d-Out"
,
nvinfer1
::
Dims3
(
3
,
6
,
6
));
validator
.
DeclOutputVar
(
"pool2d-Out"
,
nvinfer1
::
Dims3
(
3
,
3
,
3
));
// Prepare Op description
framework
::
OpDesc
desc
;
...
...
@@ -41,10 +42,10 @@ void test_pool2d(bool global_pooling, bool ceil_mode) {
desc
.
SetInput
(
"X"
,
{
"pool2d-X"
});
desc
.
SetOutput
(
"Out"
,
{
"pool2d-Out"
});
std
::
vector
<
int
>
ksize
({
3
,
3
});
std
::
vector
<
int
>
ksize
({
2
,
2
});
std
::
vector
<
int
>
strides
({
2
,
2
});
std
::
vector
<
int
>
paddings
({
0
,
0
});
std
::
string
pooling_t
=
"max"
;
std
::
string
pooling_t
=
pool_type
;
desc
.
SetAttr
(
"pooling_type"
,
pooling_t
);
desc
.
SetAttr
(
"ksize"
,
ksize
);
...
...
@@ -63,7 +64,8 @@ void test_pool2d(bool global_pooling, bool ceil_mode) {
TEST
(
Pool2dOpConverter
,
normal
)
{
test_pool2d
(
false
,
false
);
}
TEST
(
Pool2dOpConverter
,
test_global_pooling
)
{
test_pool2d
(
true
,
false
);
}
TEST
(
Pool2dOpConverter
,
test_ceil_mode
)
{
test_pool2d
(
false
,
true
);
}
TEST
(
Pool2dOpConverter
,
max_ceil_test
)
{
test_pool2d
(
false
,
true
);
}
TEST
(
Pool2dOpConverter
,
avg_ceil_test
)
{
test_pool2d
(
false
,
true
,
"avg"
);
}
}
// namespace tensorrt
}
// namespace inference
...
...
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
浏览文件 @
b5c44fd4
...
...
@@ -20,30 +20,92 @@ namespace paddle {
namespace
inference
{
namespace
tensorrt
{
TEST
(
split_op
,
test
)
{
template
<
int
BatchSize
,
int
Axis
>
void
TensorRTSplitTest
(
const
std
::
vector
<
int
>
&
in_shape
,
const
std
::
vector
<
int
>
&
sections
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
""
});
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
10
,
parameters
,
scope
,
1000
);
validator
.
DeclInputVar
(
"split_input"
,
nvinfer1
::
DimsCHW
(
3
,
2
,
2
));
validator
.
DeclOutputVar
(
"split_out1"
,
nvinfer1
::
DimsCHW
(
2
,
2
,
2
));
validator
.
DeclOutputVar
(
"split_out2"
,
nvinfer1
::
DimsCHW
(
1
,
2
,
2
));
TRTConvertValidation
validator
(
BatchSize
+
1
,
parameters
,
scope
,
10000
);
auto
make_dim
=
[](
const
std
::
vector
<
int
>
&
shape
)
{
nvinfer1
::
DimsCHW
dim
;
dim
.
c
()
=
shape
[
0
];
dim
.
h
()
=
shape
[
1
];
dim
.
w
()
=
shape
[
2
];
return
dim
;
};
validator
.
DeclInputVar
(
"split_input"
,
make_dim
(
in_shape
));
std
::
vector
<
std
::
string
>
output_vars
;
for
(
size_t
i
=
0
;
i
<
sections
.
size
();
++
i
)
{
auto
out_shape
=
in_shape
;
out_shape
[
Axis
-
1
]
=
sections
[
i
];
std
::
string
output_name
=
"split_out"
+
std
::
to_string
(
i
);
validator
.
DeclOutputVar
(
output_name
,
make_dim
(
out_shape
));
output_vars
.
push_back
(
output_name
);
}
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"split"
);
desc
.
SetInput
(
"X"
,
{
"split_input"
});
desc
.
SetOutput
(
"Out"
,
{
"split_out1"
,
"split_out2"
}
);
desc
.
SetOutput
(
"Out"
,
output_vars
);
int
num
=
0
;
int
axis
=
1
;
std
::
vector
<
int
>
output_lengths
=
{
2
,
1
};
desc
.
SetAttr
(
"axis"
,
axis
);
desc
.
SetAttr
(
"num"
,
num
);
desc
.
SetAttr
(
"sections"
,
output_lengths
);
desc
.
SetAttr
(
"axis"
,
Axis
);
desc
.
SetAttr
(
"num"
,
0
);
desc
.
SetAttr
(
"sections"
,
sections
);
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
1
);
validator
.
Execute
(
BatchSize
);
}
// batch = 1, axis = 1, same shape
TEST
(
split_op
,
test_same_shape_axis1_batch1
)
{
TensorRTSplitTest
<
1
,
1
>
({
4
,
2
,
2
},
{
2
,
2
});
}
// batch = 1, axis = 1, different shape
TEST
(
split_op
,
test_different_shape_axis1_batch1
)
{
TensorRTSplitTest
<
1
,
1
>
({
3
,
2
,
2
},
{
2
,
1
});
}
// batch = 10, axis = 1, same shape
TEST
(
split_op
,
test_same_shape_axis1_batch10
)
{
TensorRTSplitTest
<
10
,
1
>
({
4
,
2
,
2
},
{
2
,
2
});
}
// batch = 10, axis = 1, different shape
TEST
(
split_op
,
test_different_shape_axis1_batch10
)
{
TensorRTSplitTest
<
10
,
1
>
({
3
,
2
,
2
},
{
2
,
1
});
}
// batch = 1, axis = 2, same shape
TEST
(
split_op
,
test_same_shape_axis2_batch1
)
{
TensorRTSplitTest
<
1
,
2
>
({
3
,
4
,
2
},
{
2
,
2
});
}
// batch = 1, axis = 2, different shape
TEST
(
split_op
,
test_different_shape_axis2_batch1
)
{
TensorRTSplitTest
<
1
,
2
>
({
3
,
3
,
2
},
{
2
,
1
});
}
// batch = 10, axis = 2, same shape
TEST
(
split_op
,
test_same_shape_axis2_batch10
)
{
TensorRTSplitTest
<
10
,
2
>
({
3
,
4
,
2
},
{
2
,
2
});
}
// batch = 10, axis = 2, different shape
TEST
(
split_op
,
test_different_shape_axis2_batch10
)
{
TensorRTSplitTest
<
10
,
2
>
({
3
,
3
,
2
},
{
2
,
1
});
}
// batch = 1, axis = 3, same shape
TEST
(
split_op
,
test_same_shape_axis3_batch1
)
{
TensorRTSplitTest
<
1
,
3
>
({
3
,
2
,
4
},
{
2
,
2
});
}
// batch = 1, axis = 3, different shape
TEST
(
split_op
,
test_different_shape_axis3_batch1
)
{
TensorRTSplitTest
<
1
,
3
>
({
3
,
2
,
3
},
{
2
,
1
});
}
// batch = 10, axis = 3, same shape
TEST
(
split_op
,
test_same_shape_axis3_batch10
)
{
TensorRTSplitTest
<
10
,
3
>
({
3
,
2
,
4
},
{
2
,
2
});
}
// batch = 10, axis = 3, different shape
TEST
(
split_op
,
test_different_shape_axis3_batch10
)
{
TensorRTSplitTest
<
10
,
3
>
({
3
,
2
,
3
},
{
2
,
1
});
}
}
// namespace tensorrt
...
...
paddle/fluid/inference/tensorrt/convert/ut_helper.h
浏览文件 @
b5c44fd4
...
...
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
...
...
paddle/fluid/inference/tensorrt/engine.cc
浏览文件 @
b5c44fd4
...
...
@@ -257,9 +257,10 @@ void TensorRTEngine::freshDeviceId() {
}
nvinfer1
::
IPluginLayer
*
TensorRTEngine
::
AddPlugin
(
nvinfer1
::
ITensor
*
const
*
inputs
,
int
nbInputs
,
PluginTensorRT
*
plugin
)
{
nvinfer1
::
ITensor
*
const
*
inputs
,
int
num_inputs
,
plugin
::
PluginTensorRT
*
plugin
)
{
owned_plugin_
.
emplace_back
(
plugin
);
return
infer_network_
.
get
()
->
addPluginExt
(
inputs
,
n
bI
nputs
,
*
plugin
);
return
infer_network_
.
get
()
->
addPluginExt
(
inputs
,
n
um_i
nputs
,
*
plugin
);
}
}
// namespace tensorrt
...
...
paddle/fluid/inference/tensorrt/engine.h
浏览文件 @
b5c44fd4
...
...
@@ -128,7 +128,7 @@ class TensorRTEngine : public EngineBase {
int
GetRuntimeBatch
();
int
GetDevice
()
{
return
device_
;
}
nvinfer1
::
IPluginLayer
*
AddPlugin
(
nvinfer1
::
ITensor
*
const
*
inputs
,
int
n
bInputs
,
PluginTensorRT
*
);
int
n
um_inputs
,
plugin
::
PluginTensorRT
*
);
// A pointer to CPU memory is needed of the TRT weight.
// Before TRT runs, fluid loads weight into GPU storage.
...
...
@@ -171,7 +171,7 @@ class TensorRTEngine : public EngineBase {
// The specific GPU id that the TensorRTEngine bounded to.
int
device_
;
std
::
vector
<
std
::
unique_ptr
<
PluginTensorRT
>>
owned_plugin_
;
std
::
vector
<
std
::
unique_ptr
<
plugin
::
PluginTensorRT
>>
owned_plugin_
;
// TensorRT related internal members
template
<
typename
T
>
...
...
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
浏览文件 @
b5c44fd4
nv_library
(
tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context
)
nv_library
(
tensorrt_plugin
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
avg_pool_op_plugin.cu
DEPS enforce tensorrt_engine
)
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
0 → 100644
浏览文件 @
b5c44fd4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
#include "paddle/fluid/operators/math/pooling.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
// Reports the dimensions of the plugin's single output.
//
// The result is a copy of inputDims[0] whose entries 1 and 2 are replaced by
// output_shape_[1] and output_shape_[2]; entry 0 is carried over unchanged.
// The asserts require exactly one input, output index 0, and a 3-dimensional
// input.
nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
    int index, const nvinfer1::Dims* inputDims, int nbInputs) {
  assert(nbInputs == 1);
  assert(index == 0);
  assert(inputDims[0].nbDims == 3);

  // Start from the input dimensions and overwrite the two trailing extents.
  nvinfer1::Dims result = inputDims[0];
  result.d[1] = output_shape_[1];
  result.d[2] = output_shape_[2];
  return result;
}
int
AvgPoolPlugin
::
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
{
auto
const
&
input_dims
=
this
->
getInputDims
(
0
);
int
input_size
=
0
;
float
const
*
idata
=
reinterpret_cast
<
float
const
*>
(
inputs
[
0
]);
float
**
odatas
=
reinterpret_cast
<
float
**>
(
outputs
);
paddle
::
operators
::
math
::
AvgPool
<
float
>
pool_process
;
paddle
::
operators
::
math
::
Pool2dDirectCUDAFunctor
<
paddle
::
operators
::
math
::
AvgPool
<
float
>
,
float
>
pool2d_forward
;
std
::
vector
<
int
>
input_shape
=
input_shape_
;
std
::
vector
<
int
>
output_shape
=
output_shape_
;
input_shape
.
insert
(
input_shape
.
begin
(),
batchSize
);
output_shape
.
insert
(
output_shape
.
begin
(),
batchSize
);
pool2d_forward
(
idata
,
input_shape
,
output_shape
,
ksize_
,
strides_
,
paddings_
,
pool_process
,
true
,
odatas
[
0
],
stream
);
return
cudaGetLastError
()
!=
cudaSuccess
;
}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
0 → 100644
浏览文件 @
b5c44fd4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cassert>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
// TensorRT plugin that performs 2-D average pooling.
//
// The plugin keeps the pooling hyper-parameters (kernel size, strides,
// paddings, ceil mode) together with the un-batched input shape, and derives
// the un-batched output shape from them. Entries 1 and 2 of the shape vectors
// are the pooled extents (paired with index 0 and 1 of ksize/strides/paddings
// respectively); entry 0 is copied through unchanged.
class AvgPoolPlugin : public PluginTensorRT {
 private:
  bool ceil_mode_;
  std::vector<int> ksize_;
  std::vector<int> strides_;
  std::vector<int> paddings_;
  std::vector<int> input_shape_;
  std::vector<int> output_shape_;

  // Rebuilds output_shape_ from input_shape_ and the pooling parameters.
  //
  // output_shape_ is not part of the serialized payload, so every constructor
  // (including the deserializing one) must call this to leave the plugin in a
  // usable state.
  void UpdateOutputShape() {
    output_shape_ = input_shape_;
    // In ceil mode, adding (stride - 1) before the integer division rounds the
    // number of pooling windows up instead of down.
    const int extra_h = ceil_mode_ ? strides_[0] - 1 : 0;
    const int extra_w = ceil_mode_ ? strides_[1] - 1 : 0;
    output_shape_[1] =
        (input_shape_[1] - ksize_[0] + 2 * paddings_[0] + extra_h) /
            strides_[0] +
        1;
    output_shape_[2] =
        (input_shape_[2] - ksize_[1] + 2 * paddings_[1] + extra_w) /
            strides_[1] +
        1;
  }

 protected:
  // Returns the number of bytes serialize() writes: the plugin's own fields
  // plus the base-class payload.
  size_t getSerializationSize() override {
    return SerializedSize(ceil_mode_) + SerializedSize(ksize_) +
           SerializedSize(strides_) + SerializedSize(paddings_) +
           SerializedSize(input_shape_) + getBaseSerializationSize();
  }

  // TRT will call this func when we need to serialize the configuration of
  // tensorrt.
  // It should not be called by users.
  //
  // The field order here must match the deserializing constructor below.
  void serialize(void* buffer) override {
    serializeBase(buffer);
    SerializeValue(&buffer, ceil_mode_);
    SerializeValue(&buffer, ksize_);
    SerializeValue(&buffer, strides_);
    SerializeValue(&buffer, paddings_);
    SerializeValue(&buffer, input_shape_);
  }

 public:
  // Builds a plugin from explicit pooling parameters.
  //
  // ceil_mode:   round the output extent up (true) or down (false).
  // ksize:       pooling window size, one entry per pooled dimension.
  // strides:     window stride, one entry per pooled dimension.
  // paddings:    padding applied on each side, one entry per pooled dimension.
  // input_shape: un-batched input shape; entries 1 and 2 are pooled.
  AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
                std::vector<int> strides, std::vector<int> paddings,
                std::vector<int> input_shape)
      : ceil_mode_(ceil_mode),
        ksize_(ksize),
        strides_(strides),
        paddings_(paddings),
        input_shape_(input_shape) {
    UpdateOutputShape();
  }

  // It was used for tensorrt deserialization.
  // It should not be called by users.
  AvgPoolPlugin(void const* serialData, size_t serialLength) {
    deserializeBase(serialData, serialLength);
    DeserializeValue(&serialData, &serialLength, &ceil_mode_);
    DeserializeValue(&serialData, &serialLength, &ksize_);
    DeserializeValue(&serialData, &serialLength, &strides_);
    DeserializeValue(&serialData, &serialLength, &paddings_);
    DeserializeValue(&serialData, &serialLength, &input_shape_);
    // output_shape_ is derived state and is not stored in the buffer; without
    // this call a deserialized plugin would carry an empty output shape.
    UpdateOutputShape();
  }

  // Returns a new heap-allocated plugin with the same pooling configuration.
  // Ownership passes to the caller.
  AvgPoolPlugin* clone() const override {
    return new AvgPoolPlugin(ceil_mode_, ksize_, strides_, paddings_,
                             input_shape_);
  }

  const char* getPluginType() const override { return "avg_pool"; }

  // The plugin always produces exactly one output tensor.
  int getNbOutputs() const override { return 1; }

  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                     int nbInputDims) override;

  // Nothing to set up; always reports success (0).
  int initialize() override { return 0; }

  int enqueue(int batchSize, const void* const* inputs, void** outputs,
              void* workspace, cudaStream_t stream) override;
};
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
0 → 100644
浏览文件 @
b5c44fd4
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
namespace
details
{
// Device-side binary functor returning the sum of its two operands.
template <typename T>
struct Add {
  __device__ T operator()(const T& a, const T& b) const {
    T sum = a + b;
    return sum;
  }
};
// Device-side binary functor returning the product of its two operands.
template <typename T>
struct Mul {
  __device__ T operator()(const T& a, const T& b) const {
    T product = a * b;
    return product;
  }
};
// Applies `op(x, y)` where `x`/`out` hold batch_size * num_rows * num_cols
// values and `y` holds one value per (batch, row) pair, i.e. every element of
// a row of `x` is combined with the same `y` value.
//
// Work distribution: batches are walked sequentially by every thread; rows
// are strided across blocks (blockIdx.x / gridDim.x) and columns are strided
// across the threads of a block (threadIdx.x / blockDim.x).
template <typename T, typename Operator>
__global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out,
                                 int batch_size, int num_rows, int num_cols) {
  for (int batch_id = 0; batch_id < batch_size; ++batch_id) {
    for (int row = blockIdx.x; row < num_rows; row += gridDim.x) {
      // Index of this row within the flattened (batch, row) space.
      const int flat_row = batch_id * num_rows + row;
      T value_y = y[flat_row];
      // Offset of the first element of this row in x / out.
      const int offset = flat_row * num_cols;
      for (int col = threadIdx.x; col < num_cols; col += blockDim.x) {
        const int pos = offset + col;
        T value_x = x[pos];
        out[pos] = op(value_x, value_y);
      }
    }
  }
}
// Launches the broadcasting element-wise kernel on `stream`.
//
// X is viewed as [prev, midd, post] (per batch item) and Y as [midd]. Only
// the prev == 1 layout is implemented; every other layout throws
// "Not implemented.".
//
// The block size is `post` rounded up to a multiple of 32, capped at 1024;
// the grid size is `midd`, capped at 65535.
template <typename T, typename Operator>
static void ElementWise(Operator op, const T* x, const T* y, T* out,
                        int batch_size, int prev, int midd, int post,
                        cudaStream_t stream) {
  const int kThreadsPerBlock = 1024;
  const int kMaximumBlocks = 65535;

  if (prev != 1) {
    // Covers both the post == 1 layout and the general layout, neither of
    // which has a kernel yet.
    PADDLE_THROW("Not implemented.");
  } else {
    int num_threads = kThreadsPerBlock;
    if (!(post > kThreadsPerBlock)) {
      // Round up to the next multiple of 32.
      num_threads = ((post + 31) >> 5) << 5;
    }

    int num_blocks = kMaximumBlocks;
    if (midd < kMaximumBlocks) {
      num_blocks = midd;
    }

    ColumnWiseKernel<<<num_blocks, num_threads, 0, stream>>>(
        op, x, y, out, batch_size, midd, post);
  }
}
}
// namespace details
// Reports the dimensions of the plugin's single output, which are the
// dimensions of the first input. Enforces that `index` is 0, that exactly two
// inputs are supplied and that `input_dims` is non-null.
nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
    int index, const nvinfer1::Dims* input_dims, int num_inputs) {
  PADDLE_ENFORCE_EQ(index, 0);
  PADDLE_ENFORCE_EQ(num_inputs, 2);
  PADDLE_ENFORCE_NOT_NULL(input_dims);
  const nvinfer1::Dims& first_input_dims = input_dims[0];
  return first_input_dims;
}
// Resolves the broadcast axis and pre-computes the [prev, midd, post]
// decomposition of X used by enqueue().
//
// Steps:
//   1. axis_ == -1 is resolved to dims_x_.nbDims - dims_y_.nbDims.
//   2. Trailing dimensions of Y that equal 1 are dropped.
//   3. Y's remaining dimensions must match X's dimensions starting at axis_.
//   4. prev_size_ = product of X dims before axis_,
//      midd_size_ = product of Y's (trimmed) dims,
//      post_size_ = product of X dims after the matched range.
//
// Returns 0 on success; violations are reported through PADDLE_ENFORCE_*.
int ElementWisePlugin::initialize() {
  PADDLE_ENFORCE_GT(dims_y_.nbDims, 0);

  axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
  // A negative axis (e.g. -1 resolved while Y has more dimensions than X)
  // would make the dims_x_.d[i + axis_] lookups below read before the start
  // of the array, so reject it up front.
  PADDLE_ENFORCE_GE(axis_, 0);

  // Drop trailing unit dimensions of Y.
  int trimed_nb_dims = dims_y_.nbDims;
  for (; trimed_nb_dims > 0; --trimed_nb_dims) {
    if (dims_y_.d[trimed_nb_dims - 1] != 1) {
      break;
    }
  }
  dims_y_.nbDims = trimed_nb_dims;

  PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_);
  PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims);

  prev_size_ = 1;
  midd_size_ = 1;
  post_size_ = 1;

  for (int i = 0; i < axis_; ++i) {
    prev_size_ *= dims_x_.d[i];
  }

  for (int i = 0; i < dims_y_.nbDims; ++i) {
    PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
                      "Broadcast dimension mismatch.");
    midd_size_ *= dims_y_.d[i];
  }

  for (int i = axis_ + dims_y_.nbDims; i < dims_x_.nbDims; ++i) {
    post_size_ *= dims_x_.d[i];
  }
  return 0;
}
// Runs the element-wise operation selected by type_ on `stream`.
//
// inputs[0] is X, inputs[1] is Y and outputs[0] receives the result; all
// three are reinterpreted as float buffers. kSUM and kPROD are supported; any
// other operation throws "Not implemented.". `workspace` is unused.
//
// Returns 0 when cudaGetLastError() reports cudaSuccess, non-zero otherwise.
int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
                               void** outputs, void* workspace,
                               cudaStream_t stream) {
  const float* x = reinterpret_cast<const float*>(inputs[0]);
  const float* y = reinterpret_cast<const float*>(inputs[1]);
  float* out = reinterpret_cast<float*>(outputs[0]);

  switch (type_) {
    case nvinfer1::ElementWiseOperation::kSUM:
      details::ElementWise(details::Add<float>(), x, y, out, batch_size,
                           prev_size_, midd_size_, post_size_, stream);
      break;
    case nvinfer1::ElementWiseOperation::kPROD:
      details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
                           prev_size_, midd_size_, post_size_, stream);
      break;
    default:
      PADDLE_THROW("Not implemented.");
  }

  const bool launch_failed = cudaGetLastError() != cudaSuccess;
  return launch_failed;
}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
0 → 100644
浏览文件 @
b5c44fd4
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
class
ElementWisePlugin
:
public
PluginTensorRT
{
public:
ElementWisePlugin
(
nvinfer1
::
ElementWiseOperation
type
,
nvinfer1
::
Dims
const
&
dims_x
,
nvinfer1
::
Dims
const
&
dims_y
,
int
axis
)
:
type_
(
type
),
dims_x_
(
dims_x
),
dims_y_
(
dims_y
),
axis_
(
axis
),
prev_size_
(
1
),
midd_size_
(
1
),
post_size_
(
1
)
{}
ElementWisePlugin
(
void
const
*
serial_data
,
size_t
serial_length
)
{
deserializeBase
(
serial_data
,
serial_length
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
axis_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
dims_x_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
dims_y_
);
}
ElementWisePlugin
*
clone
()
const
override
{
// return new ElementWisePlugin(dims_x_, dims_y_, axis_);
return
nullptr
;
}
const
char
*
getPluginType
()
const
override
{
return
"elementwise"
;
}
nvinfer1
::
Dims
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
override
;
int
initialize
()
override
;
// execute the layer
int
enqueue
(
int
batch_size
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
);
protected:
size_t
getSerializationSize
()
override
{
return
SerializedSize
(
axis_
)
+
SerializedSize
(
dims_x_
)
+
SerializedSize
(
dims_y_
)
+
getBaseSerializationSize
();
}
void
serialize
(
void
*
buffer
)
override
{
serializeBase
(
buffer
);
SerializeValue
(
&
buffer
,
axis_
);
SerializeValue
(
&
buffer
,
dims_x_
);
SerializeValue
(
&
buffer
,
dims_y_
);
}
nvinfer1
::
ElementWiseOperation
type_
;
nvinfer1
::
Dims
dims_x_
;
nvinfer1
::
Dims
dims_y_
;
int
axis_
;
int
prev_size_
;
int
midd_size_
;
int
post_size_
;
};
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
浏览文件 @
b5c44fd4
...
...
@@ -20,6 +20,7 @@
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
static
const
int
CUDA_NUM_THREADS
=
1024
;
static
const
int
CUDA_MAX_NUM_BLOCKS
=
65535
;
...
...
@@ -126,6 +127,7 @@ int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
return
cudaGetLastError
()
!=
cudaSuccess
;
}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
浏览文件 @
b5c44fd4
...
...
@@ -21,6 +21,7 @@
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
class
PReluPlugin
:
public
PluginTensorRT
{
TensorRTEngine
::
Weight
alpha_
;
...
...
@@ -63,6 +64,7 @@ class PReluPlugin : public PluginTensorRT {
void
*
workspace
,
cudaStream_t
stream
)
override
;
};
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/serialize.h
浏览文件 @
b5c44fd4
...
...
@@ -14,10 +14,15 @@
#pragma once
#include <cassert>
#include <cstring>
#include <type_traits>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
template
<
typename
T
>
inline
void
SerializeValue
(
void
**
buffer
,
T
const
&
value
);
...
...
@@ -26,7 +31,7 @@ template <typename T>
inline
void
DeserializeValue
(
void
const
**
buffer
,
size_t
*
buffer_size
,
T
*
value
);
namespace
{
namespace
details
{
template
<
typename
T
,
class
Enable
=
void
>
struct
Serializer
{};
...
...
@@ -36,10 +41,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
std
::
is_enum
<
T
>::
value
||
std
::
is_pod
<
T
>::
value
>::
type
>
{
static
size_t
SerializedSize
(
T
const
&
value
)
{
return
sizeof
(
T
);
}
static
void
Serialize
(
void
**
buffer
,
T
const
&
value
)
{
std
::
memcpy
(
*
buffer
,
&
value
,
sizeof
(
T
));
reinterpret_cast
<
char
*&>
(
*
buffer
)
+=
sizeof
(
T
);
}
static
void
Deserialize
(
void
const
**
buffer
,
size_t
*
buffer_size
,
T
*
value
)
{
assert
(
*
buffer_size
>=
sizeof
(
T
));
std
::
memcpy
(
value
,
*
buffer
,
sizeof
(
T
));
...
...
@@ -51,10 +58,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
template
<
>
struct
Serializer
<
const
char
*>
{
static
size_t
SerializedSize
(
const
char
*
value
)
{
return
strlen
(
value
)
+
1
;
}
static
void
Serialize
(
void
**
buffer
,
const
char
*
value
)
{
std
::
strcpy
(
static_cast
<
char
*>
(
*
buffer
),
value
);
std
::
strcpy
(
static_cast
<
char
*>
(
*
buffer
),
value
);
// NOLINT
reinterpret_cast
<
char
*&>
(
*
buffer
)
+=
strlen
(
value
)
+
1
;
}
static
void
Deserialize
(
void
const
**
buffer
,
size_t
*
buffer_size
,
const
char
**
value
)
{
*
value
=
static_cast
<
char
const
*>
(
*
buffer
);
...
...
@@ -73,39 +82,46 @@ struct Serializer<std::vector<T>,
static
size_t
SerializedSize
(
std
::
vector
<
T
>
const
&
value
)
{
return
sizeof
(
value
.
size
())
+
value
.
size
()
*
sizeof
(
T
);
}
static
void
Serialize
(
void
**
buffer
,
std
::
vector
<
T
>
const
&
value
)
{
SerializeValue
(
buffer
,
value
.
size
());
size_t
nbyte
=
value
.
size
()
*
sizeof
(
T
);
std
::
memcpy
(
*
buffer
,
value
.
data
(),
nbyte
);
reinterpret_cast
<
char
*&>
(
*
buffer
)
+=
nbyte
;
}
static
void
Deserialize
(
void
const
**
buffer
,
size_t
*
buffer_size
,
std
::
vector
<
T
>*
value
)
{
size_t
size
;
DeserializeValue
(
buffer
,
buffer_size
,
&
size
);
value
->
resize
(
size
);
size_t
nbyte
=
value
->
size
()
*
sizeof
(
T
);
assert
(
*
buffer_size
>=
nbyte
);
PADDLE_ENFORCE_GE
(
*
buffer_size
,
nbyte
);
std
::
memcpy
(
value
->
data
(),
*
buffer
,
nbyte
);
reinterpret_cast
<
char
const
*&>
(
*
buffer
)
+=
nbyte
;
*
buffer_size
-=
nbyte
;
}
};
}
// namespace
}
// namespace
details
template
<
typename
T
>
inline
size_t
SerializedSize
(
T
const
&
value
)
{
return
Serializer
<
T
>::
SerializedSize
(
value
);
return
details
::
Serializer
<
T
>::
SerializedSize
(
value
);
}
template
<
typename
T
>
inline
void
SerializeValue
(
void
**
buffer
,
T
const
&
value
)
{
return
Serializer
<
T
>::
Serialize
(
buffer
,
value
);
return
details
::
Serializer
<
T
>::
Serialize
(
buffer
,
value
);
}
template
<
typename
T
>
inline
void
DeserializeValue
(
void
const
**
buffer
,
size_t
*
buffer_size
,
T
*
value
)
{
return
Serializer
<
T
>::
Deserialize
(
buffer
,
buffer_size
,
value
);
return
details
::
Serializer
<
T
>::
Deserialize
(
buffer
,
buffer_size
,
value
);
}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
浏览文件 @
b5c44fd4
...
...
@@ -12,70 +12,167 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <
stdio
.h>
#include <
cassert
>
#include <
cuda_fp16
.h>
#include <
algorithm
>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
nvinfer1
::
Dims
SplitPlugin
::
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
inputDims
,
int
nbInputs
)
{
assert
(
nbInputs
==
1
);
assert
(
index
<
this
->
getNbOutputs
());
nvinfer1
::
Dims
const
&
input_dims
=
inputDims
[
0
];
nvinfer1
::
Dims
output_dims
=
input_dims
;
// copied from operators::math::SplitFunctor
template
<
typename
T
>
__global__
void
SplitKernel
(
const
T
*
input_data
,
const
int
in_row
,
const
int
in_col
,
const
int
*
out_cols
,
int
out_cols_size
,
T
**
outputs_data
)
{
int
tid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
curr_segment
=
0
;
int
curr_offset
=
out_cols
[
0
];
for
(;
tid_x
<
in_col
;
tid_x
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
curr_col_offset
=
out_cols
[
curr_segment
+
1
];
while
(
curr_col_offset
<=
tid_x
)
{
curr_offset
=
curr_col_offset
;
++
curr_segment
;
curr_col_offset
=
out_cols
[
curr_segment
+
1
];
}
int
local_col
=
tid_x
-
curr_offset
;
int
segment_width
=
curr_col_offset
-
curr_offset
;
T
*
output_ptr
=
outputs_data
[
curr_segment
];
if
(
output_ptr
!=
nullptr
)
{
int
tid_y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
for
(;
tid_y
<
in_row
;
tid_y
+=
blockDim
.
y
*
gridDim
.
y
)
output_ptr
[
tid_y
*
segment_width
+
local_col
]
=
input_data
[
tid_y
*
in_col
+
tid_x
];
}
}
}
template
<
typename
T
>
__global__
void
SplitKernel
(
const
T
*
input_data
,
const
int
in_row
,
const
int
in_col
,
const
int
fixed_out_col
,
T
**
outputs_data
)
{
int
tid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid_x
<
in_col
;
tid_x
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
split
=
tid_x
/
fixed_out_col
;
int
in_offset
=
tid_x
-
split
*
fixed_out_col
;
T
*
output_ptr
=
outputs_data
[
split
];
if
(
output_ptr
!=
nullptr
)
{
int
tid_y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
for
(;
tid_y
<
in_row
;
tid_y
+=
blockDim
.
y
*
gridDim
.
y
)
output_ptr
[
tid_y
*
fixed_out_col
+
in_offset
]
=
input_data
[
tid_y
*
in_col
+
tid_x
];
}
}
}
nvinfer1
::
Dims
SplitPlugin
::
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
{
PADDLE_ENFORCE_EQ
(
num_inputs
,
1
);
PADDLE_ENFORCE_LT
(
index
,
this
->
getNbOutputs
());
nvinfer1
::
Dims
output_dims
=
input_dims
[
0
];
output_dims
.
d
[
axis_
]
=
output_length_
.
at
(
index
);
return
output_dims
;
}
int
SplitPlugin
::
initialize
()
{
PADDLE_ENFORCE_LE
(
axis_
,
nvinfer1
::
Dims
::
MAX_DIMS
);
// notice input dims is [C, H, W]
nvinfer1
::
Dims
dims
=
this
->
getInputDims
(
0
);
outer_rows_
=
1
;
inner_cols_
=
1
;
for
(
int
i
=
0
;
i
<
axis_
;
++
i
)
{
outer_rows_
*=
dims
.
d
[
i
];
}
for
(
int
i
=
axis_
+
1
;
i
<
dims
.
nbDims
;
++
i
)
{
inner_cols_
*=
dims
.
d
[
i
];
}
same_shape_
=
true
;
std
::
vector
<
int
>
segment_offsets
(
1
,
0
);
for
(
int
i
=
0
;
i
<
this
->
getNbOutputs
();
++
i
)
{
segment_offsets
.
push_back
(
segment_offsets
.
back
()
+
output_length_
[
i
]);
}
segment_offsets_
=
segment_offsets
;
nvinfer1
::
Dims
dims
=
this
->
getInputDims
(
0
);
nx_
=
1
;
for
(
int
i
=
dims
.
nbDims
-
1
;
i
>
axis_
;
--
i
)
{
nx_
*=
dims
.
d
[
i
];
if
(
output_length_
[
i
]
!=
output_length_
[
0
])
{
same_shape_
=
false
;
}
ny_
=
dims
.
d
[
axis_
];
nz_
=
1
;
for
(
int
i
=
axis_
-
1
;
i
>=
0
;
--
i
)
{
nz_
*=
dims
.
d
[
i
];
segment_offsets
.
push_back
(
segment_offsets
.
back
()
+
output_length_
[
i
]
*
inner_cols_
);
}
inner_cols_
*=
dims
.
d
[
axis_
];
d_segment_offsets_
=
segment_offsets
;
segment_offsets_
=
std
::
move
(
segment_offsets
);
d_output_ptrs_
.
resize
(
this
->
getNbOutputs
(),
nullptr
);
return
0
;
}
template
<
typename
T
>
inline
void
Split
(
cudaStream_t
stream
,
const
bool
same_shape
,
const
int
outer_rows
,
const
int
inner_cols
,
const
std
::
vector
<
int
>&
segment_offsets
,
const
int
*
d_segment_offsets
,
const
T
*
input
,
T
**
outputs
)
{
const
int
kThreadsPerBlock
=
1024
;
const
int
kMaxBlocks
=
65535
;
int
block_cols
=
kThreadsPerBlock
;
if
(
inner_cols
<
kThreadsPerBlock
)
{
// block_cols is aligned by 32.
block_cols
=
((
inner_cols
+
31
)
>>
5
)
<<
5
;
}
int
block_rows
=
kThreadsPerBlock
/
block_cols
;
dim3
block_size
=
dim3
(
block_cols
,
block_rows
,
1
);
int
grid_cols
=
std
::
min
((
inner_cols
+
block_cols
-
1
)
/
block_cols
,
kMaxBlocks
);
int
grid_rows
=
std
::
min
(
kMaxBlocks
/
grid_cols
,
std
::
max
(
outer_rows
/
block_rows
,
1
));
dim3
grid_size
=
dim3
(
grid_cols
,
grid_rows
,
1
);
if
(
same_shape
)
{
SplitKernel
<<<
grid_size
,
block_size
,
0
,
stream
>>>
(
input
,
outer_rows
,
inner_cols
,
segment_offsets
[
1
],
outputs
);
}
else
{
SplitKernel
<<<
grid_size
,
block_size
,
0
,
stream
>>>
(
input
,
outer_rows
,
inner_cols
,
d_segment_offsets
,
static_cast
<
int
>
(
segment_offsets
.
size
()),
outputs
);
}
}
int
SplitPlugin
::
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
{
auto
const
&
input_dims
=
this
->
getInputDims
(
0
);
i
nt
input_size
=
0
;
float
const
*
idata
=
reinterpret_cast
<
float
const
*>
(
inputs
[
0
]);
float
**
odata
s
=
reinterpret_cast
<
float
**>
(
outputs
);
// kernel impl here.
int
inputBatchOffset
=
nx_
*
ny_
*
nz_
;
for
(
size_t
i
=
0
;
i
<
this
->
getNbOutputs
();
i
++
)
{
for
(
size_t
j
=
0
;
j
<
batchSize
;
j
++
)
{
float
const
*
input_ptr
=
reinterpret_cast
<
float
const
*>
(
inputs
[
0
]
);
i
f
(((
batchSize
==
1
&&
axis_
==
0
)
||
axis_
==
-
1
)
&&
this
->
getNbOutputs
()
<
10
)
{
float
**
output_ptr
s
=
reinterpret_cast
<
float
**>
(
outputs
);
int
data_type_size
=
(
this
->
getDataType
()
==
nvinfer1
::
DataType
::
kFLOAT
)
?
sizeof
(
float
)
:
sizeof
(
__half
)
;
for
(
int
i
=
0
;
i
<
this
->
getNbOutputs
();
++
i
)
{
PADDLE_ENFORCE
(
cudaMemcpyAsync
(
odatas
[
i
]
+
j
*
(
segment_offsets_
[
i
+
1
]
-
segment_offsets_
[
i
])
*
nx_
*
sizeof
(
float
),
inputs
[
0
]
+
(
inputBatchOffset
*
j
+
segment_offsets_
[
i
]
*
nx_
)
*
sizeof
(
float
),
(
segment_offsets_
[
i
+
1
]
-
segment_offsets_
[
i
])
*
nx_
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
,
stream
);
output_ptrs
[
i
],
input_ptr
+
segment_offsets_
[
i
],
(
segment_offsets_
[
i
+
1
]
-
segment_offsets_
[
i
])
*
data_type_size
,
cudaMemcpyDeviceToDevice
,
stream
)
==
cudaSuccess
);
}
}
else
{
outer_rows_
*=
batchSize
;
const
int
*
d_segment_offsets_ptr
=
thrust
::
raw_pointer_cast
(
&
d_segment_offsets_
[
0
]);
float
**
output_ptrs
=
thrust
::
raw_pointer_cast
(
&
d_output_ptrs_
[
0
]);
PADDLE_ENFORCE
(
cudaMemcpyAsync
(
output_ptrs
,
outputs
,
this
->
getNbOutputs
()
*
sizeof
(
float
*
),
cudaMemcpyHostToDevice
,
stream
)
==
cudaSuccess
);
if
(
this
->
getDataType
()
==
nvinfer1
::
DataType
::
kFLOAT
)
{
Split
(
stream
,
same_shape_
,
outer_rows_
,
inner_cols_
,
segment_offsets_
,
d_segment_offsets_ptr
,
input_ptr
,
output_ptrs
);
}
else
{
Split
(
stream
,
same_shape_
,
outer_rows_
,
inner_cols_
,
segment_offsets_
,
d_segment_offsets_ptr
,
(
__half
*
)
input_ptr
,
// NOLINT
(
__half
**
)
output_ptrs
);
// NOLINT
}
}
return
cudaGetLastError
()
!=
cudaSuccess
;
}
}
// tensorrt
}
// inference
}
// paddle
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
浏览文件 @
b5c44fd4
...
...
@@ -14,61 +14,63 @@
#pragma once
#include <thrust/device_vector.h>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
class
SplitPlugin
:
public
PluginTensorRT
{
int
axis_
;
std
::
vector
<
int
>
output_length_
;
int
nx_
,
ny_
,
nz_
;
std
::
vector
<
int
>
segment_offsets_
;
public:
SplitPlugin
(
int
axis
,
std
::
vector
<
int
>
const
&
output_lengths
)
:
axis_
(
axis
),
same_shape_
(
true
),
output_length_
(
output_lengths
)
{}
SplitPlugin
(
void
const
*
serial_data
,
size_t
serial_length
)
{
deserializeBase
(
serial_data
,
serial_length
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
axis_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
output_length_
);
}
SplitPlugin
*
clone
()
const
override
{
return
new
SplitPlugin
(
axis_
,
output_length_
);
}
const
char
*
getPluginType
()
const
override
{
return
"split"
;
}
int
getNbOutputs
()
const
override
{
return
output_length_
.
size
();
}
nvinfer1
::
Dims
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
override
;
int
initialize
()
override
;
int
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
override
;
protected:
virtual
size_t
getSerializationSize
()
override
{
size_t
getSerializationSize
()
override
{
return
SerializedSize
(
axis_
)
+
SerializedSize
(
output_length_
)
+
getBaseSerializationSize
();
}
// TRT will call this func when we need to serialize the configuration of
// tensorrt.
// It should not be called by users.
virtual
void
serialize
(
void
*
buffer
)
override
{
void
serialize
(
void
*
buffer
)
override
{
serializeBase
(
buffer
);
SerializeValue
(
&
buffer
,
axis_
);
SerializeValue
(
&
buffer
,
output_length_
);
}
public:
SplitPlugin
(
int
axis
,
std
::
vector
<
int
>
const
&
output_lengths
)
:
axis_
(
axis
),
output_length_
(
output_lengths
)
{
assert
(
axis
<=
nvinfer1
::
Dims
::
MAX_DIMS
);
}
// It was used for tensorrt deserialization.
// It should not be called by users.
SplitPlugin
(
void
const
*
serialData
,
size_t
serialLength
)
{
deserializeBase
(
serialData
,
serialLength
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
axis_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
output_length_
);
}
SplitPlugin
*
clone
()
const
override
{
return
new
SplitPlugin
(
axis_
,
output_length_
);
}
virtual
const
char
*
getPluginType
()
const
override
{
return
"split"
;
}
virtual
int
getNbOutputs
()
const
override
{
return
output_length_
.
size
();
}
virtual
nvinfer1
::
Dims
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
inputs
,
int
nbInputDims
)
override
;
virtual
int
initialize
()
override
;
virtual
int
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
override
;
int
axis_
;
int
outer_rows_
;
int
inner_cols_
;
bool
same_shape_
;
std
::
vector
<
int
>
output_length_
;
std
::
vector
<
int
>
segment_offsets_
;
thrust
::
device_vector
<
int
>
d_segment_offsets_
;
thrust
::
device_vector
<
float
*>
d_output_ptrs_
;
};
}
// tensorrt
}
// inference
}
// paddle
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
浏览文件 @
b5c44fd4
...
...
@@ -17,6 +17,7 @@
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
void
PluginTensorRT
::
serializeBase
(
void
*&
buffer
)
{
SerializeValue
(
&
buffer
,
input_dims_
);
...
...
@@ -25,12 +26,12 @@ void PluginTensorRT::serializeBase(void*& buffer) {
SerializeValue
(
&
buffer
,
data_format_
);
}
void
PluginTensorRT
::
deserializeBase
(
void
const
*&
serial
D
ata
,
size_t
&
serial
L
ength
)
{
DeserializeValue
(
&
serial
Data
,
&
serialL
ength
,
&
input_dims_
);
DeserializeValue
(
&
serial
Data
,
&
serialL
ength
,
&
max_batch_size_
);
DeserializeValue
(
&
serial
Data
,
&
serialL
ength
,
&
data_type_
);
DeserializeValue
(
&
serial
Data
,
&
serialL
ength
,
&
data_format_
);
void
PluginTensorRT
::
deserializeBase
(
void
const
*&
serial
_d
ata
,
size_t
&
serial
_l
ength
)
{
DeserializeValue
(
&
serial
_data
,
&
serial_l
ength
,
&
input_dims_
);
DeserializeValue
(
&
serial
_data
,
&
serial_l
ength
,
&
max_batch_size_
);
DeserializeValue
(
&
serial
_data
,
&
serial_l
ength
,
&
data_type_
);
DeserializeValue
(
&
serial
_data
,
&
serial_l
ength
,
&
data_format_
);
}
size_t
PluginTensorRT
::
getBaseSerializationSize
()
{
...
...
@@ -44,18 +45,17 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
(
format
==
nvinfer1
::
PluginFormat
::
kNCHW
));
}
void
PluginTensorRT
::
configureWithFormat
(
const
nvinfer1
::
Dims
*
inputDims
,
int
nbInputs
,
const
nvinfer1
::
Dims
*
outputDims
,
int
nbOutputs
,
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
,
int
maxBatchSize
)
{
void
PluginTensorRT
::
configureWithFormat
(
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
,
const
nvinfer1
::
Dims
*
output_dims
,
int
num_outputs
,
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
,
int
max_batch_size
)
{
data_type_
=
type
;
data_format_
=
format
;
input_dims_
.
assign
(
input
Dims
,
inputDims
+
nbI
nputs
);
max_batch_size_
=
max
BatchS
ize
;
input_dims_
.
assign
(
input
_dims
,
input_dims
+
num_i
nputs
);
max_batch_size_
=
max
_batch_s
ize
;
}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
浏览文件 @
b5c44fd4
...
...
@@ -14,23 +14,30 @@
#pragma once
#include <
cassert
>
#include <
NvInfer.h
>
#include <cstring>
#include <iostream>
#include <unordered_map>
#include <vector>
#include "NvInfer.h"
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
profile
);
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
namespace
plugin
{
class
PluginTensorRT
:
public
nvinfer1
::
IPluginExt
{
public:
PluginTensorRT
()
{}
// It was used for TensorRT deserialization.
// It should not be called by users.
PluginTensorRT
(
const
void
*
serialized_data
,
size_t
length
)
{}
virtual
~
PluginTensorRT
()
{}
nvinfer1
::
Dims
const
&
getInputDims
(
int
index
)
const
{
return
input_dims_
.
at
(
index
);
}
...
...
@@ -38,43 +45,66 @@ class PluginTensorRT : public nvinfer1::IPluginExt {
nvinfer1
::
DataType
getDataType
()
const
{
return
data_type_
;
}
nvinfer1
::
PluginFormat
getDataFormat
()
const
{
return
data_format_
;
}
virtual
const
char
*
getPluginVersion
()
const
{
return
"1"
;
}
void
AddInput
(
nvinfer1
::
ITensor
*
input
)
{
inputs_
.
push_back
(
input
);
}
std
::
vector
<
nvinfer1
::
ITensor
*>&
GetInputs
()
{
return
inputs_
;
}
virtual
nvinfer1
::
IPluginExt
*
clone
()
const
=
0
;
virtual
const
char
*
getPluginType
()
const
=
0
;
// The following functions are inherited from nvinfer1::IPluginExt
// Get the number of outputs from the layer
int
getNbOutputs
()
const
{
return
1
;
}
// Get the dimension of an output tensor
virtual
nvinfer1
::
Dims
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
=
0
;
// Find the workspace size required by the layer
size_t
getWorkspaceSize
(
int
)
const
override
{
return
0
;
}
// Initialize the layer for execution.
// This is called when the engine is created.
int
initialize
()
override
{
return
0
;
}
// Shutdown the layer. This is called when the engine is destroyed
void
terminate
()
override
{}
virtual
~
PluginTensorRT
()
{}
// Execute the layer
virtual
int
enqueue
(
int
batch_size
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
=
0
;
// Find the size of the serialization buffer required
virtual
size_t
getSerializationSize
()
=
0
;
// Serialize the layer config to buffer.
// TensorRT will call this func to serialize the configuration of TensorRT
// engine. It should not be called by users.
virtual
void
serialize
(
void
*
buffer
)
=
0
;
// Check format support. The default is FLOAT32 and NCHW.
bool
supportsFormat
(
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
)
const
override
;
void
configureWithFormat
(
const
nvinfer1
::
Dims
*
inputDims
,
int
nbInputs
,
const
nvinfer1
::
Dims
*
outputDims
,
int
nbOutputs
,
// Configure the layer
void
configureWithFormat
(
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
,
const
nvinfer1
::
Dims
*
output_dims
,
int
num_outputs
,
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
,
int
maxBatchSize
)
override
;
// *NOTE* The following functions need to be overridden in the subclass.
virtual
nvinfer1
::
IPluginExt
*
clone
()
const
=
0
;
virtual
const
char
*
getPluginType
()
const
=
0
;
// Initialize the layer for execution. This is called when the engine is
// created.
int
initialize
()
override
{
return
0
;
}
// Serialize the layer config to buffer.
virtual
void
serialize
(
void
*
buffer
)
=
0
;
virtual
size_t
getSerializationSize
()
=
0
;
virtual
int
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
=
0
;
int
max_batch_size
)
override
;
protected:
// Deserialize input_dims, max_batch_size, data_type, data_format
void
deserializeBase
(
void
const
*&
serialData
,
size_t
&
serialLength
);
void
deserializeBase
(
void
const
*&
serial_data
,
// NOLINT
size_t
&
serial_length
);
// NOLINT
size_t
getBaseSerializationSize
();
// Serialize input_dims, max_batch_size, data_type, data_format
void
serializeBase
(
void
*&
buffer
);
void
serializeBase
(
void
*&
buffer
);
// NOLINT
std
::
vector
<
nvinfer1
::
Dims
>
input_dims_
;
size_t
max_batch_size_
;
nvinfer1
::
DataType
data_type_
;
nvinfer1
::
PluginFormat
data_format_
;
std
::
vector
<
nvinfer1
::
ITensor
*>
inputs_
;
};
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
b5c44fd4
set
(
INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
INFERENCE_EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
analysis
${
analysis_deps
}
ir_pass_manager analysis_predictor
)
endif
()
function
(
download_model install_dir model_name
)
if
(
NOT EXISTS
${
install_dir
}
)
inference_download_and_uncompress
(
${
install_dir
}
${
INFERENCE_URL
}
${
model_name
}
)
...
...
@@ -27,14 +31,14 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename
endfunction
()
# RNN1
if
(
NOT APPLE
)
if
(
NOT APPLE
AND WITH_MKLML
)
set
(
RNN1_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/rnn1"
)
download_model_and_data
(
${
RNN1_INSTALL_DIR
}
"rnn1%2Fmodel.tar.gz"
"rnn1%2Fdata.txt.tar.gz"
)
inference_analysis_api_test
(
test_analyzer_rnn1
${
RNN1_INSTALL_DIR
}
analyzer_rnn1_tester.cc
)
else
()
# TODO: fix this test on MACOS, the reason is that
# fusion_seqexpand_concat_fc_op is not supported on MACOS
message
(
WARNING
"These tests has been disabled in OSX before being fixed:
\n
test_analyzer_rnn1"
)
# TODO: fix this test on MACOS
and OPENBLAS
, the reason is that
# fusion_seqexpand_concat_fc_op is not supported on MACOS
and OPENBLAS
message
(
WARNING
"These tests has been disabled in OSX
or WITH_MKL=OFF
before being fixed:
\n
test_analyzer_rnn1"
)
endif
()
# RNN2
...
...
@@ -109,6 +113,6 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_download_and_uncompress
(
${
TRT_MODEL_INSTALL_DIR
}
${
INFERENCE_URL
}
/tensorrt_test
"trt_test_models.tar.gz"
)
endif
()
inference_analysis_test
(
test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
analysis
${
analysis_deps
}
ir_pass_manager analysis_predictor
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TRT_MODEL_INSTALL_DIR
}
/trt_test_models SERIAL
)
endif
()
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
浏览文件 @
b5c44fd4
...
...
@@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) {
cfg
->
device
=
0
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
specify_input_name
=
true
;
cfg
->
SetCpuMathLibraryNumThreads
(
FLAGS_paddle_num_threads
);
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
paddle/fluid/inference/tests/api/config_printer.h
浏览文件 @
b5c44fd4
...
...
@@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os
<<
GenSpaces
(
num_spaces
)
<<
"param_file: "
<<
config
.
param_file
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"specify_input_name: "
<<
config
.
specify_input_name
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"cpu_num_threads: "
<<
config
.
cpu_math_library_num_threads
()
<<
"
\n
"
;
num_spaces
--
;
os
<<
GenSpaces
(
num_spaces
)
<<
"}
\n
"
;
return
os
;
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
b5c44fd4
...
...
@@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true,
"Running the inference program in analysis mode."
);
DECLARE_bool
(
profile
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
inference
{
...
...
@@ -51,7 +52,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
LOG
(
INFO
)
<<
*
reinterpret_cast
<
const
contrib
::
AnalysisConfig
*>
(
config
);
return
;
}
LOG
(
INFO
)
<<
*
config
;
LOG
(
INFO
)
<<
*
reinterpret_cast
<
const
NativeConfig
*>
(
config
)
;
}
void
CompareResult
(
const
std
::
vector
<
PaddleTensor
>
&
outputs
,
...
...
@@ -206,23 +207,40 @@ void TestMultiThreadPrediction(
int
batch_size
=
FLAGS_batch_size
;
int
num_times
=
FLAGS_repeat
;
std
::
vector
<
std
::
thread
>
threads
;
std
::
vector
<
std
::
unique_ptr
<
PaddlePredictor
>>
predictors
;
predictors
.
emplace_back
(
CreateTestPredictor
(
config
,
use_analysis
));
for
(
int
tid
=
1
;
tid
<
num_threads
;
++
tid
)
{
predictors
.
emplace_back
(
predictors
.
front
()
->
Clone
());
}
auto
main_predictor
=
CreateTestPredictor
(
config
,
use_analysis
);
size_t
total_time
{
0
};
for
(
int
tid
=
0
;
tid
<
num_threads
;
++
tid
)
{
threads
.
emplace_back
([
&
,
tid
]()
{
#ifdef PADDLE_WITH_MKLDNN
platform
::
set_cur_thread_id
(
static_cast
<
int
>
(
tid
)
+
1
);
#endif
// Each thread should have local inputs and outputs.
// The inputs of each thread are all the same.
std
::
vector
<
PaddleTensor
>
outputs_tid
;
auto
&
predictor
=
predictors
[
tid
];
LOG
(
INFO
)
<<
"running thread "
<<
tid
;
// To ensure the thread binding correctly,
// please clone inside the threadpool.
auto
predictor
=
main_predictor
->
Clone
();
#ifdef PADDLE_WITH_MKLDNN
if
(
use_analysis
)
{
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
())
->
SetMkldnnThreadID
(
static_cast
<
int
>
(
tid
)
+
1
);
}
#endif
// warmup run
LOG
(
INFO
)
<<
"Running thread "
<<
tid
<<
", warm up run..."
;
{
Timer
warmup_timer
;
warmup_timer
.
tic
();
predictor
->
Run
(
inputs
[
0
],
outputs
,
batch_size
);
PrintTime
(
batch_size
,
1
,
num_threads
,
tid
,
warmup_timer
.
toc
(),
1
);
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
paddle
::
platform
::
ResetProfiler
();
}
#endif
}
LOG
(
INFO
)
<<
"Thread "
<<
tid
<<
" run "
<<
num_times
<<
" times..."
;
{
Timer
timer
;
timer
.
tic
();
for
(
int
i
=
0
;
i
<
num_times
;
i
++
)
{
...
...
@@ -235,6 +253,7 @@ void TestMultiThreadPrediction(
total_time
+=
time
;
PrintTime
(
batch_size
,
num_times
,
num_threads
,
tid
,
time
/
num_times
,
inputs
.
size
());
}
});
}
for
(
int
i
=
0
;
i
<
num_threads
;
++
i
)
{
...
...
paddle/fluid/inference/tests/api/trt_models_tester.cc
浏览文件 @
b5c44fd4
...
...
@@ -145,5 +145,3 @@ TEST(TensorRT_mobilenet, analysis) {
}
// namespace inference
}
// namespace paddle
USE_PASS
(
tensorrt_subgraph_pass
);
paddle/fluid/inference/utils/CMakeLists.txt
0 → 100644
浏览文件 @
b5c44fd4
# Library target "benchmark", built from benchmark.cc; depends on enforce.
cc_library
(
benchmark SRCS benchmark.cc DEPS enforce
)
# Unit-test target for the benchmark library, built from benchmark_tester.cc.
cc_test
(
test_benchmark SRCS benchmark_tester.cc DEPS benchmark
)
paddle/fluid/inference/utils/benchmark.cc
0 → 100644
浏览文件 @
b5c44fd4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/benchmark.h"
#include <sstream>
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
inference
{
// Renders the benchmark record as a small tab-separated table: a separator
// line, a header row, and one value row (name, batch size, thread count,
// latency, qps).
//
// qps is derived as 1000 / latency. It is computed in floating point so that
// the value is not truncated (e.g. a latency of 220 yields 4.54545 rather than
// 4), and a non-positive latency is reported as 0 qps instead of dividing by
// zero.
// NOTE(review): the 1000 factor suggests latency is expressed in
// milliseconds -- confirm against the callers of SetLatency().
std::string Benchmark::SerializeToString() const {
  std::stringstream ss;

  // Header.
  ss << "-----------------------------------------------------\n";
  ss << "name\t";
  ss << "batch_size\t";
  ss << "num_threads\t";
  ss << "latency\t";
  ss << "qps";
  ss << '\n';

  // Values.
  ss << name_ << "\t";
  ss << batch_size_ << "\t";
  ss << num_threads_ << "\t";
  ss << latency_ << "\t";
  const double qps = latency_ > 0 ? 1000.0 / latency_ : 0.0;
  ss << qps;
  ss << '\n';
  return ss.str();
}
// Appends the serialized benchmark record to the file at `path`.
// The stream is opened in append mode, so earlier records are kept. Failure
// to open the file is reported through PADDLE_ENFORCE.
void Benchmark::PersistToFile(const std::string &path) const {
  std::ofstream out;
  out.open(path, std::ios::app);
  PADDLE_ENFORCE(out.is_open(), "Can not open %s to add benchmark", path);

  const std::string record = SerializeToString();
  out << record;
  out.flush();
  out.close();
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/utils/benchmark.h
0 → 100644
浏览文件 @
b5c44fd4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <iostream>
namespace
paddle
{
namespace
inference
{
/*
 * Helper class to calculate the performance.
 *
 * Holds one benchmark record (name, batch size, thread count, GPU flag and
 * latency) and can render it as text or append it to a file.
 * Every field has a default value, so a freshly constructed Benchmark can be
 * read safely before any setter is called.
 */
struct Benchmark {
  // Batch size used for the measured run.
  int batch_size() const { return batch_size_; }
  void SetBatchSize(int x) { batch_size_ = x; }

  // Number of threads used for the measured run.
  int num_threads() const { return num_threads_; }
  void SetNumThreads(int x) { num_threads_ = x; }

  // Whether the run was marked as using the GPU. The flag can only be
  // switched on; there is no way to clear it again.
  bool use_gpu() const { return use_gpu_; }
  void SetUseGpu() { use_gpu_ = true; }

  // Measured latency; 0 until SetLatency() is called.
  int latency() const { return latency_; }
  void SetLatency(int x) { latency_ = x; }

  // Label identifying this record.
  const std::string& name() const { return name_; }
  void SetName(const std::string& name) { name_ = name; }

  // Returns the record formatted as text.
  std::string SerializeToString() const;
  // Appends the formatted record to the file at `path`.
  void PersistToFile(const std::string& path) const;

 private:
  bool use_gpu_{false};
  int batch_size_{0};
  // Previously left uninitialized, which made latency() return an
  // indeterminate value on a default-constructed object.
  int latency_{0};
  int num_threads_{1};
  std::string name_;
};
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/utils/benchmark_tester.cc
0 → 100644
浏览文件 @
b5c44fd4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/benchmark.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
using
namespace
paddle
::
inference
;
// Smoke test: fill in a record and log its serialized form.
TEST(Benchmark, basic) {
  Benchmark bench;
  bench.SetLatency(220);
  bench.SetUseGpu();
  bench.SetBatchSize(10);
  bench.SetName("key0");

  const std::string report = bench.SerializeToString();
  LOG(INFO) << "benchmark:\n" << report;
}
// Writes the same record to "1.log" three times in a row; the test passes as
// long as none of the writes raises.
TEST(Benchmark, PersistToFile) {
  Benchmark bench;
  bench.SetLatency(220);
  bench.SetUseGpu();
  bench.SetBatchSize(10);
  bench.SetName("key0");

  for (int round = 0; round < 3; ++round) {
    bench.PersistToFile("1.log");
  }
}
\ No newline at end of file
paddle/fluid/memory/allocation/best_fit_allocator_test.cc
浏览文件 @
b5c44fd4
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include <random>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
...
...
paddle/fluid/memory/allocation/best_fit_allocator_test.cu
浏览文件 @
b5c44fd4
...
...
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <random>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
...
...
paddle/fluid/memory/allocation/cpu_allocator.h
浏览文件 @
b5c44fd4
...
...
@@ -15,6 +15,12 @@
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#ifdef _WIN32
#define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) \
(((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#endif
namespace
paddle
{
namespace
memory
{
namespace
allocation
{
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE)
add_subdirectory
(
distributed_ops
)
endif
()
if
(
NOT WIN32
)
add_subdirectory
(
reader
)
endif
()
add_subdirectory
(
reader
)
if
(
NOT WIN32
)
add_subdirectory
(
nccl
)
...
...
@@ -34,29 +32,39 @@ if (WITH_GPU AND TENSORRT_FOUND)
add_subdirectory
(
tensorrt
)
endif
()
register_operators
(
EXCLUDES warpctc_op conv_fusion_op
)
# warpctc_cudnn need cudnn 7 above
SET
(
OP_HEADER_DEPS xxhash
)
if
(
WITH_GPU
)
SET
(
OP_HEADER_DEPS
${
OP_HEADER_DEPS
}
cub
)
endif
()
register_operators
(
EXCLUDES warpctc_op conv_fusion_op DEPS
${
OP_HEADER_DEPS
}
)
# warpctc_op needs cudnn 7 above
if
(
WITH_GPU AND NOT WIN32
)
if
(
${
CUDNN_MAJOR_VERSION
}
VERSION_LESS 7
)
op_library
(
warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc
)
else
()
op_library
(
warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale
)
endif
()
# conv_fusion_op needs cudnn 7 above
if
(
NOT
${
CUDNN_MAJOR_VERSION
}
VERSION_LESS 7
)
op_library
(
conv_fusion_op
)
file
(
APPEND
${
pybind_file
}
"USE_CUDA_ONLY_OP(conv2d_fusion);
\n
"
)
endif
()
else
()
op_library
(
warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale
)
endif
()
set
(
COMMON_OP_DEPS
""
)
set
(
COMMON_OP_DEPS
${
OP_HEADER_DEPS
}
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sample
r
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executo
r
)
if
(
NOT WIN32
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
dynload_warpctc
)
endif
()
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions
)
if
(
WITH_GPU
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
depthwise_conv
cub
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
depthwise_conv
)
endif
()
# FIXME(typhoonzero): operator deps may not needed.
...
...
paddle/fluid/operators/conv_fusion_op.cu.cc
浏览文件 @
b5c44fd4
...
...
@@ -22,6 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search);
namespace
paddle
{
namespace
operators
{
#if CUDNN_VERSION >= 7001
using
Tensor
=
framework
::
Tensor
;
using
ScopedTensorDescriptor
=
platform
::
ScopedTensorDescriptor
;
using
ScopedFilterDescriptor
=
platform
::
ScopedFilterDescriptor
;
...
...
@@ -178,10 +179,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
}
};
#endif
}
// namespace operators
}
// namespace paddle
#if CUDNN_VERSION >= 7001
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
conv2d_fusion
,
ops
::
CUDNNConvFusionOpKernel
<
float
>
,
ops
::
CUDNNConvFusionOpKernel
<
double
>
);
#endif
paddle/fluid/operators/detection/CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -22,7 +22,7 @@ iou_similarity_op.cu)
detection_library
(
mine_hard_examples_op SRCS mine_hard_examples_op.cc
)
detection_library
(
multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc
)
detection_library
(
prior_box_op SRCS prior_box_op.cc prior_box_op.cu
)
detection_library
(
density_prior_box_op SRCS density_prior_box_op.cc
)
detection_library
(
density_prior_box_op SRCS density_prior_box_op.cc
density_prior_box_op.cu
)
detection_library
(
anchor_generator_op SRCS anchor_generator_op.cc
anchor_generator_op.cu
)
detection_library
(
target_assign_op SRCS target_assign_op.cc
...
...
paddle/fluid/operators/detection/density_prior_box_op.cc
浏览文件 @
b5c44fd4
...
...
@@ -39,17 +39,15 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
auto
fixed_sizes
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
float
>>
(
"fixed_sizes"
);
auto
fixed_ratios
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
float
>>
(
"fixed_ratios"
);
auto
densities
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"densities"
);
bool
flatten
=
ctx
->
Attrs
().
Get
<
bool
>
(
"flatten_to_2d"
);
PADDLE_ENFORCE_EQ
(
fixed_sizes
.
size
(),
densities
.
size
(),
"The number of fixed_sizes and densities must be equal."
);
size_t
num_priors
=
0
;
if
((
fixed_sizes
.
size
()
>
0
)
&&
(
densities
.
size
()
>
0
))
{
for
(
size_t
i
=
0
;
i
<
densities
.
size
();
++
i
)
{
if
(
fixed_ratios
.
size
()
>
0
)
{
num_priors
+=
(
fixed_ratios
.
size
())
*
(
pow
(
densities
[
i
],
2
));
}
}
}
if
(
!
flatten
)
{
std
::
vector
<
int64_t
>
dim_vec
(
4
);
dim_vec
[
0
]
=
input_dims
[
2
];
dim_vec
[
1
]
=
input_dims
[
3
];
...
...
@@ -57,6 +55,11 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
dim_vec
[
3
]
=
4
;
ctx
->
SetOutputDim
(
"Boxes"
,
framework
::
make_ddim
(
dim_vec
));
ctx
->
SetOutputDim
(
"Variances"
,
framework
::
make_ddim
(
dim_vec
));
}
else
{
int64_t
dim0
=
input_dims
[
2
]
*
input_dims
[
3
]
*
num_priors
;
ctx
->
SetOutputDim
(
"Boxes"
,
{
dim0
,
4
});
ctx
->
SetOutputDim
(
"Variances"
,
{
dim0
,
4
});
}
}
protected:
...
...
@@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
Tensor
>
(
"Input"
)
->
type
()),
platform
::
CPU
Place
());
ctx
.
Get
Place
());
}
};
...
...
@@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
});
AddAttr
<
bool
>
(
"clip"
,
"(bool) Whether to clip out-of-boundary boxes."
)
.
SetDefault
(
true
);
AddAttr
<
bool
>
(
"flatten_to_2d"
,
"(bool) Whether to flatten to 2D and "
"the second dim is 4."
)
.
SetDefault
(
false
);
AddAttr
<
float
>
(
"step_w"
,
"Density prior boxes step across width, 0.0 for auto calculation."
)
...
...
paddle/fluid/operators/detection/density_prior_box_op.cu
0 → 100644
浏览文件 @
b5c44fd4
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/density_prior_box_op.h"
namespace
paddle
{
namespace
operators
{
// Clamps `in` to the closed interval [0, 1].
// The bounds are expressed in T so that a float instantiation stays in single
// precision instead of being promoted to double by the literals.
template <typename T>
static __device__ inline T Clip(T in) {
  return min(max(in, static_cast<T>(0)), static_cast<T>(1));
}
// CUDA kernel that fills `out` with density prior boxes and `var` with their
// variances.
//
// `ratios_shift` is read as four consecutive arrays of `num_priors` elements:
// box widths, box heights, x-shifts and y-shifts. For every feature-map cell
// (h, w) and every prior k, one box of four values (xmin, ymin, xmax, ymax) is
// written at offset ((h * width + w) * num_priors + k) * 4, normalized by the
// image size and limited to [0, 1]. The same offset in `var` receives
// (var_xmin, var_ymin, var_xmax, var_ymax).
//
// Threads walk the (height, width * num_priors) index space with a 2-D
// grid-stride loop, so any launch configuration covers the whole output.
template <typename T>
static __global__ void GenDensityPriorBox(
    const int height, const int width, const int im_height, const int im_width,
    const T offset, const T step_width, const T step_height,
    const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin,
    const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) {
  // First element handled by this thread and the stride to its next one.
  const int first_col = blockIdx.x * blockDim.x + threadIdx.x;
  const int first_row = blockIdx.y * blockDim.y + threadIdx.y;
  const int col_stride = blockDim.x * gridDim.x;
  const int row_stride = blockDim.y * gridDim.y;

  // The four per-prior tables packed back to back in ratios_shift.
  const T* prior_width = ratios_shift;
  const T* prior_height = ratios_shift + num_priors;
  const T* shift_x = ratios_shift + 2 * num_priors;
  const T* shift_y = ratios_shift + 3 * num_priors;

  const int row_len = width * num_priors;

  for (int row = first_row; row < height; row += row_stride) {
    for (int col = first_col; col < row_len; col += col_stride) {
      // Decompose the flattened column into feature-map x and prior index.
      const int cell_y = row;
      const int cell_x = col / num_priors;
      const int prior = col % num_priors;

      // Center of the feature-map cell in image coordinates.
      T center_x = (cell_x + offset) * step_width;
      T center_y = (cell_y + offset) * step_height;

      // Center of this particular prior box.
      T box_center_x = center_x + shift_x[prior];
      T box_center_y = center_y + shift_y[prior];

      T half_width = prior_width[prior] / 2.;
      T half_height = prior_height[prior] / 2.;

      T left = max((box_center_x - half_width) / im_width, 0.);
      T top = max((box_center_y - half_height) / im_height, 0.);
      T right = min((box_center_x + half_width) / im_width, 1.);
      T bottom = min((box_center_y + half_height) / im_height, 1.);

      if (is_clip) {
        left = Clip<T>(left);
        top = Clip<T>(top);
        right = Clip<T>(right);
        bottom = Clip<T>(bottom);
      }

      const int base = (row * width * num_priors + col) * 4;

      T* box = out + base;
      box[0] = left;
      box[1] = top;
      box[2] = right;
      box[3] = bottom;

      T* box_var = var + base;
      box_var[0] = var_xmin;
      box_var[1] = var_ymin;
      box_var[2] = var_xmax;
      box_var[3] = var_ymax;
    }
  }
}
// CUDA kernel class for the density_prior_box operator.
//
// Compute() builds, on the host, a table with the width, height, x-shift and
// y-shift of every prior box of a single feature-map cell, copies that table
// to the device and launches GenDensityPriorBox to expand it over the whole
// feature map into the "Boxes" and "Variances" outputs.
template <typename T>
class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");

    auto variances = ctx.Attr<std::vector<float>>("variances");
    auto is_clip = ctx.Attr<bool>("clip");

    auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
    auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
    auto densities = ctx.Attr<std::vector<int>>("densities");

    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
    T offset = static_cast<T>(ctx.Attr<float>("offset"));

    // Dimensions 2 and 3 are used as height and width respectively.
    auto img_width = image->dims()[3];
    auto img_height = image->dims()[2];

    auto feature_width = input->dims()[3];
    auto feature_height = input->dims()[2];

    // A zero step in either direction means "derive both steps from the
    // image / feature-map size ratio".
    T step_width, step_height;
    if (step_w == 0 || step_h == 0) {
      step_width = static_cast<T>(img_width) / feature_width;
      step_height = static_cast<T>(img_height) / feature_height;
    } else {
      step_width = step_w;
      step_height = step_h;
    }

    // Each density d contributes d * d boxes per fixed ratio.
    int num_priors = 0;
    for (size_t i = 0; i < densities.size(); ++i) {
      num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
    }
    int step_average = static_cast<int>((step_width + step_height) * 0.5);

    // Host-side table laid out as four consecutive arrays of num_priors
    // elements: widths, heights, x-shifts, y-shifts.
    framework::Tensor h_temp;
    T* tdata = h_temp.mutable_data<T>({num_priors * 4}, platform::CPUPlace());
    int idx = 0;
    // NOTE(review): densities[s] is indexed with the fixed_sizes index, so
    // this relies on both attributes having the same length, and a density
    // of 0 would divide by zero below -- confirm both are validated before
    // the kernel runs.
    for (size_t s = 0; s < fixed_sizes.size(); ++s) {
      auto fixed_size = fixed_sizes[s];
      int density = densities[s];
      for (size_t r = 0; r < fixed_ratios.size(); ++r) {
        float ar = fixed_ratios[r];
        int shift = step_average / density;
        float box_width_ratio = fixed_size * sqrt(ar);
        float box_height_ratio = fixed_size / sqrt(ar);
        for (int di = 0; di < density; ++di) {
          for (int dj = 0; dj < density; ++dj) {
            float center_x_temp = shift / 2. + dj * shift - step_average / 2.;
            float center_y_temp = shift / 2. + di * shift - step_average / 2.;
            tdata[idx] = box_width_ratio;
            tdata[num_priors + idx] = box_height_ratio;
            tdata[2 * num_priors + idx] = center_x_temp;
            tdata[3 * num_priors + idx] = center_y_temp;
            idx++;
          }
        }
      }
    }

    boxes->mutable_data<T>(ctx.GetPlace());
    vars->mutable_data<T>(ctx.GetPlace());

    framework::Tensor d_temp;
    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);

    // At least use 32 threads, at most 512 threads.
    // blockx is multiple of 32.
    // The template argument is given explicitly so both operands of std::min
    // share the dimension type; mixing it with a `long` literal does not
    // compile on platforms where the two types differ.
    using DimT = decltype(feature_width);
    int blockx =
        std::min<DimT>(((feature_width * num_priors + 31) >> 5) << 5, 512);
    int gridx = (feature_width * num_priors + blockx - 1) / blockx;
    dim3 threads(blockx, 1);
    dim3 grids(gridx, feature_height);

    auto stream =
        ctx.template device_context<platform::CUDADeviceContext>().stream();
    GenDensityPriorBox<T><<<grids, threads, 0, stream>>>(
        feature_height, feature_width, img_height, img_width, offset,
        step_width, step_height, num_priors, d_temp.data<T>(), is_clip,
        variances[0], variances[1], variances[2], variances[3],
        boxes->data<T>(), vars->data<T>());
  }
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
density_prior_box
,
ops
::
DensityPriorBoxOpCUDAKernel
<
float
>
,
ops
::
DensityPriorBoxOpCUDAKernel
<
double
>
);
paddle/fluid/operators/detection/density_prior_box_op.h
浏览文件 @
b5c44fd4
/* Copyright (c) 201
6
PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 201
8
PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
...
...
@@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
step_height
=
step_h
;
}
int
num_priors
=
0
;
if
(
fixed_sizes
.
size
()
>
0
&&
densities
.
size
()
>
0
)
{
for
(
size_t
i
=
0
;
i
<
densities
.
size
();
++
i
)
{
if
(
fixed_ratios
.
size
()
>
0
)
{
num_priors
+=
(
fixed_ratios
.
size
())
*
(
pow
(
densities
[
i
],
2
));
}
}
}
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
vars
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
e_boxes
=
framework
::
EigenTensor
<
T
,
4
>::
From
(
*
boxes
).
setConstant
(
0.0
);
auto
box_dim
=
vars
->
dims
();
boxes
->
Resize
({
feature_height
,
feature_width
,
num_priors
,
4
});
auto
e_boxes
=
framework
::
EigenTensor
<
T
,
4
>::
From
(
*
boxes
).
setConstant
(
0.0
);
int
step_average
=
static_cast
<
int
>
((
step_width
+
step_height
)
*
0.5
);
for
(
int
h
=
0
;
h
<
feature_height
;
++
h
)
{
...
...
@@ -76,7 +74,6 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
auto
fixed_size
=
fixed_sizes
[
s
];
int
density
=
densities
[
s
];
// Generate density prior boxes with fixed ratios.
if
(
fixed_ratios
.
size
()
>
0
)
{
for
(
size_t
r
=
0
;
r
<
fixed_ratios
.
size
();
++
r
)
{
float
ar
=
fixed_ratios
[
r
];
int
shift
=
step_average
/
density
;
...
...
@@ -111,7 +108,6 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
}
}
}
}
if
(
clip
)
{
platform
::
Transform
<
platform
::
CPUDeviceContext
>
trans
;
ClipFunctor
<
T
>
clip_func
;
...
...
@@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
e_vars
=
var_et
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
box_num
,
1
));
vars
->
Resize
(
var_dim
);
boxes
->
Resize
(
box_dim
);
}
};
// namespace operators
...
...
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
b5c44fd4
...
...
@@ -22,6 +22,8 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
rpc_disable_reuse_port
);
namespace
paddle
{
namespace
operators
{
namespace
distributed
{
...
...
@@ -383,6 +385,9 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
// Channel configurations:
grpc
::
ChannelArguments
args
;
args
.
SetInt
(
GRPC_ARG_MAX_RECONNECT_BACKOFF_MS
,
2000
);
if
(
FLAGS_rpc_disable_reuse_port
)
{
args
.
SetInt
(
GRPC_ARG_ALLOW_REUSEPORT
,
0
);
}
args
.
SetCompressionAlgorithm
(
GRPC_COMPRESS_NONE
);
args
.
SetMaxSendMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
args
.
SetMaxReceiveMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
...
...
paddle/fluid/operators/distributed/grpc_server.cc
浏览文件 @
b5c44fd4
...
...
@@ -20,6 +20,8 @@ limitations under the License. */
using
::
grpc
::
ServerAsyncResponseWriter
;
DECLARE_bool
(
rpc_disable_reuse_port
);
namespace
paddle
{
namespace
operators
{
namespace
distributed
{
...
...
@@ -252,6 +254,20 @@ void AsyncGRPCServer::WaitServerReady() {
VLOG
(
40
)
<<
"AsyncGRPCServer WaitSeverReady"
;
}
// Server builder option that turns off SO_REUSEPORT on the server socket by
// setting GRPC_ARG_ALLOW_REUSEPORT to 0 in the channel arguments.
// Adapted from:
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
class NoReusePortOption : public ::grpc::ServerBuilderOption {
 public:
  // Disables port reuse in the arguments handed over by the builder.
  void UpdateArguments(::grpc::ChannelArguments* args) override {
    args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
  }

  // No plugins are added or changed by this option.
  void UpdatePlugins(
      std::vector<std::unique_ptr<::grpc::ServerBuilderPlugin>>* plugins)
      override {}
};
void
AsyncGRPCServer
::
StartServer
()
{
::
grpc
::
ServerBuilder
builder
;
builder
.
AddListeningPort
(
bind_address_
,
::
grpc
::
InsecureServerCredentials
(),
...
...
@@ -259,6 +275,10 @@ void AsyncGRPCServer::StartServer() {
builder
.
SetMaxSendMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
builder
.
SetMaxReceiveMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
if
(
FLAGS_rpc_disable_reuse_port
)
{
builder
.
SetOption
(
std
::
unique_ptr
<::
grpc
::
ServerBuilderOption
>
(
new
NoReusePortOption
));
}
builder
.
RegisterService
(
&
service_
);
for
(
auto
t
:
rpc_call_map_
)
{
...
...
paddle/fluid/operators/distributed/sendrecvop_utils.cc
浏览文件 @
b5c44fd4
...
...
@@ -22,6 +22,8 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
DEFINE_bool
(
rpc_disable_reuse_port
,
false
,
"Disable SO_REUSEPORT or not."
);
namespace
paddle
{
namespace
operators
{
namespace
distributed
{
...
...
paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
0 → 100644
浏览文件 @
b5c44fd4
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <mkldnn/include/mkldnn.hpp>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "xbyak.h"
#include "xbyak_util.h"
namespace
paddle
{
namespace
operators
{
using
framework
::
DataLayout
;
using
mkldnn
::
memory
;
// Maps a textual format name to the corresponding mkldnn memory format.
//
// The comparison is case-insensitive: `format` is lower-cased IN PLACE first,
// so the caller's string is modified. Recognized names are "nchw", "nchw16c",
// "nchw8c" and "nhwc"; anything else yields memory::format::any.
static mkldnn::memory::format StringToMKLDNNFormat(std::string& format) {
  // tolower() is only defined for values representable as unsigned char (or
  // EOF), so each character is routed through unsigned char to stay well
  // defined for bytes with the high bit set.
  std::transform(format.begin(), format.end(), format.begin(), [](char c) {
    return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
  });

  if (!format.compare("nchw")) {
    return memory::format::nchw;
  } else if (!format.compare("nchw16c")) {
    return memory::format::nChw16c;
  } else if (!format.compare("nchw8c")) {
    return memory::format::nChw8c;
  } else if (!format.compare("nhwc")) {
    return memory::format::nhwc;
  } else {
    return memory::format::any;
  }
}
// Overrides the format of `tensor` with the one named by the string attribute
// `attribute`, when the operator has that attribute and its value maps to a
// concrete format. If the attribute is missing or maps to
// memory::format::any, the tensor is left untouched.
static void UpdateDataFormat(const framework::ExecutionContext& ctx,
                             framework::Tensor* tensor,
                             const char* attribute) {
  if (!ctx.op().HasAttr(attribute)) {
    return;
  }

  // Local copy: the conversion helper takes its argument by mutable reference.
  auto attr_value = ctx.Attr<std::string>(attribute);
  auto requested = StringToMKLDNNFormat(attr_value);
  if (requested == memory::format::any) {
    return;
  }

  tensor->set_format(requested);
}
// Reorders the contents of `tensor` into a plain layout and makes `tensor`
// share the reordered buffer.
//
// A temporary tensor with the same dims and layout is created with format
// nchw (when `isFourDim` is true) or nc (otherwise). The data is converted
// from the tensor's current format into it via platform::Reorder, and finally
// `tensor` is pointed at the temporary's data with ShareDataWith.
template <typename T>
static void ReorderInput(framework::Tensor* tensor,
                         const platform::Place& place,
                         const mkldnn::engine& engine, bool isFourDim) {
  using platform::to_void_cast;

  auto shape = paddle::framework::vectorize2int(tensor->dims());
  const auto plain_format =
      isFourDim ? memory::format::nchw : memory::format::nc;

  // Destination tensor: same dims and layout, plain format.
  framework::Tensor reordered;
  reordered.Resize(tensor->dims());
  reordered.set_format(plain_format);
  reordered.set_layout(tensor->layout());

  // Source memory described with the tensor's current format.
  mkldnn::memory src_memory = {
      {{shape, platform::MKLDNNGetDataType<T>(), tensor->format()}, engine},
      to_void_cast<T>(tensor->data<T>())};

  // Destination memory described with the plain format; the buffer is
  // allocated on `place`.
  mkldnn::memory dst_memory = {
      {{shape, platform::MKLDNNGetDataType<T>(), reordered.format()}, engine},
      to_void_cast<T>(reordered.mutable_data<T>(place))};

  platform::Reorder(src_memory, dst_memory);

  tensor->ShareDataWith(reordered);
}
// MKL-DNN kernel for elementwise_mul: Out = X * Y, with Y broadcast over X
// starting at `axis`.
//
// Two execution paths exist:
//   * a JIT path, used when X is in nChw16c format, Y is in nc format, the
//     second dimension of X is a multiple of 16 and the CPU reports AVX-512F;
//   * a fallback path that, if the two input formats differ, reorders any
//     input that is neither nchw nor nc into a plain format and then runs the
//     generic TransformFunctor.
//
// In both cases Out is tagged with the kMKLDNN layout and the format of X.
template <typename T>
class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    using Tensor = framework::Tensor;

    int axis = ctx.Attr<int>("axis");
    auto* x = ctx.Input<Tensor>("X");
    auto* y = ctx.Input<Tensor>("Y");
    auto* z = ctx.Output<Tensor>("Out");
    const T* x_data = x->data<T>();
    const T* y_data = y->data<T>();
    T* z_data = z->mutable_data<T>(ctx.GetPlace());

    auto x_dims = x->dims();
    auto y_dims_untrimmed = y->dims();
    auto x_int_dims = paddle::framework::vectorize2int(x_dims);

    // The optional "x_data_format"/"y_data_format" attributes override the
    // format recorded on the (nominally const) input tensors.
    UpdateDataFormat(ctx, const_cast<Tensor*>(x), "x_data_format");
    UpdateDataFormat(ctx, const_cast<Tensor*>(y), "y_data_format");

    Xbyak::util::Cpu cpu;
    const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F);
    // Guard the index: X may have fewer than two dimensions, in which case
    // reading x_int_dims[1] would be out of bounds.
    const bool are_dims_divisable =
        x_int_dims.size() > 1 && !(x_int_dims[1] % 16);
    const bool is_x_format_correct = x->format() == memory::format::nChw16c;
    const bool is_y_format_correct = y->format() == memory::format::nc;

    if (is_x_format_correct && is_y_format_correct && are_dims_divisable &&
        is_avx512_enabled) {
      int pre, n, post;
      // NOTE(review): `axis` is passed on unnormalised here (it may still be
      // -1), unlike in the fallback path below -- confirm get_mid_dims
      // tolerates that.
      get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post);

      if (post == 1) {
        PADDLE_THROW("Not implemented when post is 1");
      } else {
        // Just check whether it works for RE-Resnext.
        PADDLE_ENFORCE_EQ(x_dims.size(), 4, "X should have 4 dimensions");

        // These intentionally shadow the `n` filled in by get_mid_dims.
        int n = x_dims[0];
        int c = x_dims[1];
        int h = x_dims[2];
        int w = x_dims[3];

        PADDLE_ENFORCE(y_dims_untrimmed[0] == n && y_dims_untrimmed[1] == c,
                       "Y should be in nc format");

        constexpr int simd_width = 16;
        // Number of 16-channel blocks.
        int C = c / simd_width;

        const auto& multiply =
            math::jitkernel::KernelPool::Instance()
                .template Get<math::jitkernel::EltwiseMulnChw16cNCKernel<T>>(n);

#pragma omp parallel for collapse(2)
        for (int ni = 0; ni < n; ni++) {
          for (int ci = 0; ci < C; ci++) {
            // X and Out: one (h * w * 16)-element slab per (sample, block).
            // Y: 16 values per (sample, block).
            auto ptr_x =
                x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;

            auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
            auto ptr_z =
                z_data + ni * C * h * w * simd_width + ci * h * w * simd_width;

            multiply->Compute(ptr_x, ptr_y, ptr_z, h, w);
          }
        }
      }
    } else {
      // Fallback to naive version:
      const bool are_inputs_in_same_format = x->format() == y->format();
      const bool is_x_nchw = x->format() == memory::format::nchw;
      const bool is_x_nc = x->format() == memory::format::nc;
      const bool is_y_nchw = y->format() == memory::format::nchw;
      const bool is_y_nc = y->format() == memory::format::nc;

      if (!are_inputs_in_same_format) {
        using platform::MKLDNNDeviceContext;
        auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
        const auto& mkldnn_engine = dev_ctx.GetEngine();
        if (!(is_x_nchw || is_x_nc))
          ReorderInput<T>(const_cast<Tensor*>(x), ctx.GetPlace(),
                          mkldnn_engine, x->dims().size() == 4);
        if (!(is_y_nchw || is_y_nc))
          ReorderInput<T>(const_cast<Tensor*>(y), ctx.GetPlace(),
                          mkldnn_engine, y->dims().size() == 4);
      }

      auto mul_func = [](T a, T b) -> T { return a * b; };

      TransformFunctor<decltype(mul_func), T,
                       paddle::platform::CPUDeviceContext, T>
          functor(
              x, y, z,
              ctx.template device_context<paddle::platform::CPUDeviceContext>(),
              mul_func);

      // axis == -1 means "align Y with the trailing dimensions of X".
      axis = (axis == -1 ? x_dims.size() - y_dims_untrimmed.size() : axis);
      PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                     "Axis should be in range [0, x_dims)");

      auto y_dims = trim_trailing_singular_dims(y_dims_untrimmed);
      axis = (y_dims.size() == 0) ? x_dims.size() : axis;

      int pre, n, post;
      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);

      if (post == 1) {
        functor.RunRowWise(n, pre);
      } else {
        functor.RunMidWise(n, pre, post);
      }
    }

    // Both paths tag the output identically.  In the fallback path this runs
    // after any reorder of X, so it picks up X's post-reorder format.
    z->set_layout(DataLayout::kMKLDNN);
    z->set_format(x->format());
  }
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_KERNEL
(
elementwise_mul
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
ops
::
ElementwiseMulMKLDNNKernel
<
float
>
)
paddle/fluid/operators/elementwise/elementwise_op.h
浏览文件 @
b5c44fd4
...
...
@@ -97,6 +97,20 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
.
EqualGreaterThan
(
-
1
);
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false). Used by MKLDNN."
)
.
SetDefault
(
false
);
AddAttr
<
std
::
string
>
(
"x_data_format"
,
"(string, default NCHW) Only used in mkldnn"
"An optional string from:
\"
NHWC
\"
,
\"
NCHW
\"
,
\"
NCHW16C
\"
,
\"
NCHW8C
\"
. "
"Defaults to
\"\"
. Specify the data format of the output data, "
"the input will be transformed automatically. "
)
.
SetDefault
(
""
);
AddAttr
<
std
::
string
>
(
"y_data_format"
,
"(string, default
\"\"
) Only used in mkldnn"
"An optional string from:
\"
NHWC
\"
,
\"
NCHW
\"
,
\"
NCHW16C
\"
,
\"
NCHW8C
\"
. "
"Defaults to
\"\"
. Specify the data format of the output data, "
"the input will be transformed automatically. "
)
.
SetDefault
(
""
);
AddComment
(
string
::
Sprintf
(
R"DOC(
Elementwise %s Operator
...
...
paddle/fluid/operators/group_norm_op.cc
0 → 100644
浏览文件 @
b5c44fd4
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/group_norm_op.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
LoDTensor
=
framework
::
LoDTensor
;
using
DataLayout
=
framework
::
DataLayout
;
// Forward operator definition for group_norm.
//
// InferShape checks that X, Y, Mean and Variance are present, that
// 1 <= groups <= X.dims[1], and that the optional Scale/Bias inputs are 1-D
// with X.dims[1] elements.  Y gets the dims of X (and shares its LoD); Mean
// and Variance get dims [X.dims[0], groups].
class GroupNormOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    // Mandatory variables.
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of GroupNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Y"),
                   "Output(Y) of GroupNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
                   "Output(Mean) of GroupNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
                   "Output(Variance) of GroupNormOp should not be null.");

    auto input_shape = ctx->GetInputDim("X");
    auto batch_size = input_shape[0];
    auto channel_num = input_shape[1];
    auto groups = ctx->Attrs().Get<int>("groups");

    PADDLE_ENFORCE_LE(
        groups, channel_num,
        "'groups' must be less equal than the number of channels.");
    PADDLE_ENFORCE_GE(groups, 1, "'groups' must be greater equal than 1.");

    // Optional per-channel affine parameters.
    if (ctx->HasInput("Scale")) {
      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], channel_num);
    }
    if (ctx->HasInput("Bias")) {
      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], channel_num);
    }

    // Output shapes.
    ctx->SetOutputDim("Y", input_shape);
    for (const char *statistic : {"Mean", "Variance"}) {
      ctx->SetOutputDim(statistic, {batch_size, groups});
    }
    ctx->ShareLoD("X", "Y");
  }
};
// Declares the inputs, outputs and attributes of the group_norm operator.
//
// Inputs:  X (required), Scale and Bias (dispensable).
// Outputs: Y, plus the intermediate outputs Mean and Variance.
// Attrs:   epsilon (float in [0, 1], default 1e-5) and groups (int > 0).
class GroupNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "The input tensor.");
    // The trailing space after "C" matters: the two literals are concatenated
    // and would otherwise read "size Cthat is applied ...".
    AddInput("Scale",
             "Scale is a 1-dimensional tensor of size C "
             "that is applied to the output.")
        .AsDispensable();
    AddInput("Bias",
             "Bias is a 1-dimensional tensor of size C "
             "that is applied to the output")
        .AsDispensable();
    AddOutput("Y", "Result after normalization.");
    AddOutput("Mean", "Mean of each group.").AsIntermediate();
    AddOutput("Variance", "Variance of each group.").AsIntermediate();

    AddAttr<float>("epsilon",
                   "Constant for numerical stability [default 1e-5].")
        .SetDefault(1e-5)
        .AddCustomChecker([](const float &epsilon) {
          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 1.0f,
                         "'epsilon' should be between 0.0 and 1.0.");
        });
    AddAttr<int>("groups", "The number of groups that divided from channels.")
        .AddCustomChecker([](const int &groups) {
          PADDLE_ENFORCE_GT(groups, 0, "'groups' should be greater than zero.");
        });

    AddComment(R"DOC(
Group Normalization
Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_
)DOC");
  }
};
// Backward operator definition for group_norm.
//
// InferShape requires X, Mean, Variance and Y@GRAD, and gives each gradient
// output that is present (X@GRAD, Scale@GRAD, Bias@GRAD) the dims of the
// corresponding forward input.  The kernel data type is taken from Y@GRAD.
class GroupNormGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    // check input
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of GroupNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Mean"),
                   "Input(Mean) of GroupNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Variance"),
                   "Input(Variance) of GroupNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
                   "Input(Y@GRAD) of GroupNormOp should not be null.");

    // check output: every gradient mirrors the shape of its forward variable.
    for (const char *forward_name : {"X", "Scale", "Bias"}) {
      const auto grad_name = framework::GradVarName(forward_name);
      if (!ctx->HasOutput(grad_name)) {
        continue;
      }
      ctx->SetOutputDim(grad_name, ctx->GetInputDim(forward_name));
    }
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
    if (var == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
    }

    // Y@GRAD may be held either as a Tensor or as a LoDTensor.
    const Tensor *grad_tensor = nullptr;
    if (var->IsType<Tensor>()) {
      grad_tensor = &var->Get<Tensor>();
    } else if (var->IsType<LoDTensor>()) {
      grad_tensor = &var->Get<LoDTensor>();
    }
    if (grad_tensor == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
    }

    return framework::OpKernelType(
        framework::ToDataType(grad_tensor->type()), ctx.GetPlace());
  }
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
group_norm
,
ops
::
GroupNormOp
,
ops
::
GroupNormOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
group_norm_grad
,
ops
::
GroupNormGradOp
);
REGISTER_OP_CPU_KERNEL
(
group_norm
,
ops
::
GroupNormKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
GroupNormKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
REGISTER_OP_CPU_KERNEL
(
group_norm_grad
,
ops
::
GroupNormGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
GroupNormGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/group_norm_op.cu
0 → 100644
浏览文件 @
b5c44fd4
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cub/cub.cuh>
#include "paddle/fluid/operators/group_norm_op.h"
namespace
paddle
{
namespace
operators
{
// Accumulates, per (sample, group), the mean of x into `mean` and the mean of
// x * x into `var`.
//
// Block indexing: blockIdx.x = channel within the group, blockIdx.y = group,
// blockIdx.z = sample.  Blocks whose channel index falls beyond C exit
// immediately.  Each thread sums a strided subset of the channel's `imsize`
// elements, pre-divided by the number of elements in the group; the thread
// partials are combined in shared memory and thread 0 adds the block total to
// the outputs atomically.  Results are added to whatever `mean` and `var`
// already hold.
template <typename T>
__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C,
                                              int imsize, int groups,
                                              int group_size, T* mean,
                                              T* var) {
  const int group = blockIdx.y;
  const int channel_in_group = blockIdx.x;
  const int sample = blockIdx.z;

  // The last group may hold fewer than group_size channels.
  const int channels_here =
      min(group_size, static_cast<int>(C - group * group_size));
  const int channel = group * group_size + channel_in_group;
  if (channel >= C) return;

  const T* plane = x + (sample * C + channel) * imsize;

  T partial_sum = 0, partial_sq_sum = 0;
  int pos = threadIdx.x;
  while (pos < imsize) {
    const T value = plane[pos];
    partial_sum += value;
    partial_sq_sum += value * value;
    pos += blockDim.x;
  }
  partial_sum /= channels_here * imsize;
  partial_sq_sum /= channels_here * imsize;

  // block_total[0]: sum, block_total[1]: sum of squares.
  __shared__ T block_total[2];
  if (threadIdx.x == 0) {
    block_total[0] = block_total[1] = 0;
  }
  __syncthreads();

  paddle::platform::CudaAtomicAdd(&block_total[0], partial_sum);
  paddle::platform::CudaAtomicAdd(&block_total[1], partial_sq_sum);
  __syncthreads();

  if (threadIdx.x == 0) {
    const int slot = sample * groups + group;
    paddle::platform::CudaAtomicAdd(&mean[slot], block_total[0]);
    paddle::platform::CudaAtomicAdd(&var[slot], block_total[1]);
  }
}
// Normalizes x with the per-(sample, group) statistics and writes y.
//
// `mean` holds the group means and `var` the group means of x * x; the
// variance is derived here as var - mean * mean and stored into `real_var`
// (by the first thread of each group's first channel block).  `scale` and
// `bias` are indexed by channel and may be null.  Block indexing matches
// GroupNormForwardGetMeanAndVar.
template <typename T>
__global__ void GroupNormForward(const T* x, const T* mean, const T* var,
                                 const T* scale, const T* bias, int N, int C,
                                 int imsize, int groups, int group_size,
                                 T epsilon, T* y, T* real_var) {
  const int group = blockIdx.y;
  const int channel_in_group = blockIdx.x;
  const int sample = blockIdx.z;

  const int channel = group * group_size + channel_in_group;
  if (channel >= C) return;

  const int slot = sample * groups + group;
  T group_mean = mean[slot];
  T group_var = var[slot];
  group_var = group_var - group_mean * group_mean;
  T inv_std = 1.0 / sqrt(group_var + epsilon);

  if (channel_in_group == 0 && threadIdx.x == 0) {
    real_var[slot] = group_var;
  }

  const int plane_offset = (sample * C + channel) * imsize;
  int pos = threadIdx.x;
  while (pos < imsize) {
    T value = x[plane_offset + pos];
    value = (value - group_mean) * inv_std;
    if (scale) value *= scale[channel];
    if (bias) value += bias[channel];
    y[plane_offset + pos] = value;
    pos += blockDim.x;
  }
}
// CUDA forward kernel launcher for group_norm.
//
// Runs two kernels on the device context's stream:
//   1. GroupNormForwardGetMeanAndVar fills Mean and a temporary buffer with
//      the per-group mean of x and mean of x * x (both zeroed beforehand);
//   2. GroupNormForward turns the temporary into the variance (stored in
//      Variance) and writes the normalized, optionally scaled/shifted Y.
template <typename T>
class GroupNormKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
    const auto groups = ctx.Attr<int>("groups");

    auto* scale = ctx.Input<Tensor>("Scale");
    auto* bias = ctx.Input<Tensor>("Bias");
    auto* x = ctx.Input<Tensor>("X");

    auto* y = ctx.Output<Tensor>("Y");
    auto* mean = ctx.Output<Tensor>("Mean");
    auto* var = ctx.Output<Tensor>("Variance");

    const auto x_dims = x->dims();
    // Channels per group, rounded up.
    const int group_size = (x_dims[1] - 1) / groups + 1;

    // Allocate the outputs.
    T* y_data = y->mutable_data<T>(ctx.GetPlace());
    T* mean_data = mean->mutable_data<T>(ctx.GetPlace());
    T* var_data = var->mutable_data<T>(ctx.GetPlace());

    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    math::SetConstant<platform::CUDADeviceContext, T> fill;

    // Scratch buffer for the per-group mean of squares.
    Tensor sq_mean;
    T* sq_mean_data = sq_mean.mutable_data<T>(var->dims(), ctx.GetPlace());

    // The first kernel accumulates atomically, so both targets start at zero.
    fill(dev_ctx, mean, static_cast<T>(0));
    fill(dev_ctx, &sq_mean, static_cast<T>(0));

    const T* x_data = x->data<T>();
    const T* scale_data = scale ? scale->data<T>() : nullptr;
    const T* bias_data = bias ? bias->data<T>() : nullptr;

    int imsize = x_dims[2] * x_dims[3];
    int block_size = std::min(512, imsize);
    // One block per (channel-in-group, group, sample).
    dim3 grid(group_size, groups, x_dims[0]);
    dim3 threads(block_size, 1, 1);

    GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
        x_data, x_dims[0], x_dims[1], imsize, groups, group_size, mean_data,
        sq_mean_data);
    GroupNormForward<T><<<grid, threads, 0, dev_ctx.stream()>>>(
        x_data, mean_data, sq_mean_data, scale_data, bias_data, x_dims[0],
        x_dims[1], imsize, groups, group_size, epsilon, y_data, var_data);
  }
};
// First backward pass of group_norm.
//
// For every element of the block's channel this kernel:
//   * accumulates d_bias (sum of d_y) and d_scale (sum of x_hat * d_y) when
//     those outputs are requested, where x_hat = (x - mean) / sqrt(var + eps);
//   * multiplies d_y by scale[channel] when `scale` is given;
//   * writes d_x = d_y * inv_std when `d_x` is non-null;
//   * accumulates -d_y * inv_std into d_mean and (x - mean) * d_y into d_var.
//
// Thread partials are combined in shared memory and thread 0 adds the block
// totals to the global outputs atomically, so d_mean, d_var, d_scale and
// d_bias are added to rather than overwritten.  Block indexing:
// blockIdx.x = channel within the group, blockIdx.y = group,
// blockIdx.z = sample; blocks whose channel index falls beyond C exit
// immediately.
template <typename T>
__global__ void GroupNormBackwardGetMeanAndVar(
    const T* x, const T* mean, const T* var, const T* scale, const T* d_y,
    int N, int C, int imsize, int groups, int group_size, T epsilon, T* d_x,
    T* d_mean, T* d_var, T* d_scale, T* d_bias) {
  int gid = blockIdx.y;
  int cid = blockIdx.x;
  int bid = blockIdx.z;
  int ccid = gid * group_size + cid;
  if (ccid >= C) return;

  T x_mean = mean[bid * groups + gid];
  T x_var = var[bid * groups + gid];
  T var_inv = 1.0 / sqrt(x_var + epsilon);

  // Per-thread partial sums.
  T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0;

  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
    T tmp = x[(bid * C + ccid) * imsize + imid];
    T val = (tmp - x_mean) * var_inv;
    T dval = d_y[(bid * C + ccid) * imsize + imid];
    if (d_bias) d_bias_data += dval;
    if (d_scale) d_scale_data += val * dval;
    if (scale) dval = dval * scale[ccid];
    d_var_data += (tmp - x_mean) * dval;
    T d_tmp = dval * var_inv;
    if (d_x) d_x[(bid * C + ccid) * imsize + imid] = d_tmp;
    d_mean_data -= d_tmp;
  }

  // s_mem[0]: d_mean, s_mem[1]: d_var, s_mem[2]: d_scale, s_mem[3]: d_bias.
  // Slots 2 and 3 are only initialised and read when the matching output is
  // requested.
  __shared__ T s_mem[4];
  if (threadIdx.x == 0) {
    s_mem[0] = s_mem[1] = 0;
    if (d_scale) s_mem[2] = 0;
    if (d_bias) s_mem[3] = 0;
  }
  __syncthreads();

  paddle::platform::CudaAtomicAdd(&s_mem[0], d_mean_data);
  paddle::platform::CudaAtomicAdd(&s_mem[1], d_var_data);
  if (d_scale) paddle::platform::CudaAtomicAdd(&s_mem[2], d_scale_data);
  if (d_bias) paddle::platform::CudaAtomicAdd(&s_mem[3], d_bias_data);
  __syncthreads();

  if (threadIdx.x == 0) {
    paddle::platform::CudaAtomicAdd(&d_mean[bid * groups + gid], s_mem[0]);
    paddle::platform::CudaAtomicAdd(&d_var[bid * groups + gid], s_mem[1]);
    if (d_scale) paddle::platform::CudaAtomicAdd(&d_scale[ccid], s_mem[2]);
    if (d_bias) paddle::platform::CudaAtomicAdd(&d_bias[ccid], s_mem[3]);
  }
}
// Second backward pass of group_norm: adds the mean/variance correction terms
// to the d_x values written by GroupNormBackwardGetMeanAndVar.
//
// `d_mean` and `d_var` are the per-(sample, group) sums produced by the first
// pass.  Nothing is written when `d_x` is null.  Block indexing:
// blockIdx.x = channel within the group, blockIdx.y = group,
// blockIdx.z = sample.
template <typename T>
__global__ void GroupNormBackward(const T* x, const T* mean, const T* var,
                                  const T* d_mean, const T* d_var, int N,
                                  int C, int imsize, int groups,
                                  int group_size, T epsilon, T* d_x) {
  const int group = blockIdx.y;
  const int channel_in_group = blockIdx.x;
  const int sample = blockIdx.z;

  // The last group may hold fewer than group_size channels.
  const int channels_here =
      min(group_size, static_cast<int>(C - group * group_size));
  const int channel = group * group_size + channel_in_group;
  if (channel >= C) return;

  const int slot = sample * groups + group;
  T group_mean = mean[slot];
  T group_var = var[slot];
  T grad_mean = d_mean[slot];
  T grad_inv_std = d_var[slot];

  T grad_var = -1.0 /
               (2 * (group_var + epsilon) * sqrt(group_var + epsilon)) *
               grad_inv_std;
  grad_mean -= 2 * grad_var * group_mean;
  grad_var /= channels_here * imsize;
  grad_mean /= channels_here * imsize;

  // Without a d_x buffer there is nothing left to update.
  if (!d_x) return;

  const int plane_offset = (sample * C + channel) * imsize;
  int pos = threadIdx.x;
  while (pos < imsize) {
    T value = x[plane_offset + pos];
    d_x[plane_offset + pos] += grad_mean + value * 2 * grad_var;
    pos += blockDim.x;
  }
}
// CUDA backward kernel launcher for group_norm.
//
// Runs two kernels on the device context's stream:
//   1. GroupNormBackwardGetMeanAndVar writes the direct part of X@GRAD and
//      accumulates Scale@GRAD, Bias@GRAD and two per-group scratch sums;
//   2. GroupNormBackward uses the scratch sums to add the mean/variance
//      correction to X@GRAD.
// Each of X@GRAD, Scale@GRAD and Bias@GRAD is optional; a missing output is
// passed to the kernels as a null pointer.
template <typename T>
class GroupNormGradKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
    const auto groups = ctx.Attr<int>("groups");

    auto* x = ctx.Input<Tensor>("X");
    auto* mean = ctx.Input<Tensor>("Mean");
    auto* var = ctx.Input<Tensor>("Variance");
    auto* scale = ctx.Input<Tensor>("Scale");
    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));

    // init output
    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    const auto& x_dims = x->dims();
    // Channels per group, rounded up.
    const int group_size = (x_dims[1] - 1) / groups + 1;

    // X@GRAD is assigned (not accumulated) by the first kernel, so it does
    // not need to be zeroed.
    T* d_x_data = nullptr;
    if (d_x) {
      d_x_data = d_x->mutable_data<T>(ctx.GetPlace());
    }

    math::SetConstant<platform::CUDADeviceContext, T> fill;
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

    // Per-group scratch sums, accumulated atomically and therefore zeroed.
    Tensor var_scratch;
    var_scratch.mutable_data<T>(var->dims(), ctx.GetPlace());
    fill(dev_ctx, &var_scratch, static_cast<T>(0));
    T* var_scratch_data = var_scratch.data<T>();

    Tensor mean_scratch;
    mean_scratch.mutable_data<T>(var->dims(), ctx.GetPlace());
    fill(dev_ctx, &mean_scratch, static_cast<T>(0));
    T* mean_scratch_data = mean_scratch.data<T>();

    const T* x_data = x->data<T>();
    const T* d_y_data = d_y->data<T>();
    const T* mean_data = mean->data<T>();
    const T* var_data = var->data<T>();

    T* d_scale_data = nullptr;
    if (d_scale) {
      d_scale_data = d_scale->mutable_data<T>(ctx.GetPlace());
      fill(dev_ctx, d_scale, static_cast<T>(0));
    }

    T* d_bias_data = nullptr;
    if (d_bias) {
      d_bias_data = d_bias->mutable_data<T>(ctx.GetPlace());
      fill(dev_ctx, d_bias, static_cast<T>(0));
    }

    const T* scale_data = scale ? scale->data<T>() : nullptr;

    int imsize = x_dims[2] * x_dims[3];
    int block_size = std::min(512, imsize);
    // One block per (channel-in-group, group, sample).
    dim3 grid(group_size, groups, x_dims[0]);
    dim3 threads(block_size, 1, 1);

    GroupNormBackwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
        x_data, mean_data, var_data, scale_data, d_y_data, x_dims[0],
        x_dims[1], imsize, groups, group_size, epsilon, d_x_data,
        mean_scratch_data, var_scratch_data, d_scale_data, d_bias_data);
    GroupNormBackward<T><<<grid, threads, 0, dev_ctx.stream()>>>(
        x_data, mean_data, var_data, mean_scratch_data, var_scratch_data,
        x_dims[0], x_dims[1], imsize, groups, group_size, epsilon, d_x_data);
  }
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
group_norm
,
ops
::
GroupNormKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
GroupNormKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
group_norm_grad
,
ops
::
GroupNormGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
GroupNormGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
paddle/fluid/operators/group_norm_op.h
0 → 100644
浏览文件 @
b5c44fd4
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
LoDTensor
=
framework
::
LoDTensor
;
using
DataLayout
=
framework
::
DataLayout
;
// Generic (CPU) forward kernel for group_norm.
//
// X is walked sequentially, one (sample, group) slab at a time.  For each slab
// the mean and the variance (computed as E[x^2] - E[x]^2) are stored into Mean
// and Variance, and every element is normalized with 1 / sqrt(var + epsilon),
// then multiplied by Scale[channel] and shifted by Bias[channel] when those
// inputs are present.
template <typename DeviceContext, typename T>
class GroupNormKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
    const auto groups = ctx.Attr<int>("groups");

    auto* scale = ctx.Input<Tensor>("Scale");
    auto* bias = ctx.Input<Tensor>("Bias");
    auto* x = ctx.Input<Tensor>("X");

    auto* y = ctx.Output<Tensor>("Y");
    auto* mean = ctx.Output<Tensor>("Mean");
    auto* var = ctx.Output<Tensor>("Variance");

    const auto x_dims = x->dims();
    // Channels per group, rounded up.
    const int group_size = (x_dims[1] - 1) / groups + 1;

    T* y_cursor = y->mutable_data<T>(ctx.GetPlace());
    T* mean_data = mean->mutable_data<T>(ctx.GetPlace());
    T* var_data = var->mutable_data<T>(ctx.GetPlace());

    const T* x_cursor = x->data<T>();
    const T* scale_data = scale ? scale->data<T>() : nullptr;
    const T* bias_data = bias ? bias->data<T>() : nullptr;

    const int imsize = x_dims[2] * x_dims[3];

    for (int bid = 0; bid < x_dims[0]; bid++) {
      for (int gid = 0; gid < groups; gid++) {
        // The last group may hold fewer than group_size channels.
        const int number = std::min(
            group_size, static_cast<int>(x_dims[1] - gid * group_size));
        const int slab_elems = number * imsize;
        const int slot = bid * groups + gid;

        // Remember where the slab starts for the second pass.
        const T* slab_x = x_cursor;

        // Pass 1: sum and sum of squares over the whole slab.
        T x_mean = 0, x_var = 0;
        for (int i = 0; i < slab_elems; ++i, ++x_cursor) {
          x_mean += *x_cursor;
          x_var += *x_cursor * *x_cursor;
        }
        x_mean /= slab_elems;
        x_var /= slab_elems;
        x_var = x_var - x_mean * x_mean;
        T var_inv = 1.0 / sqrt(x_var + epsilon);

        mean_data[slot] = x_mean;
        var_data[slot] = x_var;

        // Pass 2: normalize, then apply the per-channel affine transform.
        for (int cid = 0; cid < number; cid++) {
          const int channel = gid * group_size + cid;
          for (int imid = 0; imid < imsize; imid++, slab_x++, y_cursor++) {
            T val = (*slab_x - x_mean) * var_inv;
            if (scale_data) val *= scale_data[channel];
            if (bias_data) val += bias_data[channel];
            *y_cursor = val;
          }
        }
      }
    }
  }
};
// Generic (CPU) backward kernel for group_norm.
//
// X and Y@GRAD are walked sequentially, one (sample, group) slab at a time.
// For each slab:
//   pass 1 accumulates Bias@GRAD / Scale@GRAD (when requested), adds the
//          direct term d_y * scale * inv_std to X@GRAD and gathers the sums
//          needed for the mean/variance correction;
//   pass 2 adds that correction to every X@GRAD element of the slab.
//
// X@GRAD, Scale@GRAD and Bias@GRAD are all optional and are zero-filled before
// accumulation.  The X cursor advances in pass 1, independently of whether
// X@GRAD is present, so Scale@GRAD is computed from the right slab of X even
// when X@GRAD is not requested.
template <typename DeviceContext, typename T>
class GroupNormGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
    auto* x = ctx.Input<Tensor>("X");
    auto* mean = ctx.Input<Tensor>("Mean");
    auto* var = ctx.Input<Tensor>("Variance");
    auto* scale = ctx.Input<Tensor>("Scale");
    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto groups = ctx.Attr<int>("groups");

    // init output
    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    const auto& x_dims = x->dims();
    // Channels per group, rounded up.
    const int group_size = (x_dims[1] - 1) / groups + 1;

    math::SetConstant<DeviceContext, T> set_zero;
    auto& dev_ctx = ctx.template device_context<DeviceContext>();

    T* d_x_data = nullptr;
    if (d_x) {
      d_x->mutable_data<T>(ctx.GetPlace());
      set_zero(dev_ctx, d_x, static_cast<T>(0));
      d_x_data = d_x->data<T>();
    }

    auto* x_data = x->data<T>();
    auto* y_data = d_y->data<T>();
    auto* mean_data = mean->data<T>();
    auto* var_data = var->data<T>();

    T* d_scale_data = nullptr;
    if (d_scale) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      set_zero(dev_ctx, d_scale, static_cast<T>(0));
      d_scale_data = d_scale->data<T>();
    }

    T* d_bias_data = nullptr;
    if (d_bias) {
      d_bias->mutable_data<T>(ctx.GetPlace());
      set_zero(dev_ctx, d_bias, static_cast<T>(0));
      d_bias_data = d_bias->data<T>();
    }

    const T* scale_data = nullptr;
    if (scale) scale_data = scale->data<T>();

    int imsize = x_dims[2] * x_dims[3];

    auto* iter_x_data = x_data;
    auto* iter_d_x_data = d_x_data;  // stays null when X@GRAD is absent
    auto* iter_y_data = y_data;

    for (int bid = 0; bid < x_dims[0]; bid++) {
      for (int gid = 0; gid < groups; gid++) {
        T x_mean = mean_data[bid * groups + gid];
        T x_var = var_data[bid * groups + gid];
        T var_inv = 1.0 / sqrt(x_var + epsilon);
        // The last group may hold fewer than group_size channels.
        int number = std::min(
            group_size, static_cast<int>(x_dims[1] - gid * group_size));

        // Slab start positions, needed again by pass 2.
        const T* slab_x = iter_x_data;
        T* slab_d_x = iter_d_x_data;

        T d_var_inv = 0, d_x_mean = 0;

        // Pass 1.
        for (int cid = 0; cid < number; cid++) {
          for (int imid = 0; imid < imsize;
               imid++, iter_x_data++, iter_y_data++) {
            T val = (iter_x_data[0] - x_mean) * var_inv;
            T dval = iter_y_data[0];
            if (d_bias_data) d_bias_data[gid * group_size + cid] += dval;
            if (d_scale_data)
              d_scale_data[gid * group_size + cid] += val * dval;
            if (scale_data) dval = scale_data[gid * group_size + cid] * dval;

            d_var_inv += (iter_x_data[0] - x_mean) * dval;
            T d_tmp = dval * var_inv;
            if (d_x_data) {
              iter_d_x_data[0] += d_tmp;
              // Only step the cursor when it points at real storage.
              iter_d_x_data++;
            }
            d_x_mean -= d_tmp;
          }
        }

        T d_x_var =
            -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv;
        d_x_mean -= 2 * d_x_var * x_mean;
        d_x_var /= number * imsize;
        d_x_mean /= number * imsize;

        // Pass 2: revisit the slab and add the correction terms.
        if (d_x_data) {
          for (int cid = 0; cid < number; cid++) {
            for (int imid = 0; imid < imsize; imid++, slab_x++, slab_d_x++) {
              slab_d_x[0] += d_x_mean;
              slab_d_x[0] += slab_x[0] * 2 * d_x_var;
            }
          }
        }
      }
    }
  }
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/hierarchical_sigmoid_op.h
浏览文件 @
b5c44fd4
...
...
@@ -111,7 +111,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
*
pre_out
);
auto
pre_out_grad_mat
=
EigenMatrix
<
T
>::
From
(
pre_out_grad
);
auto
out_grad_mat
=
EigenMatrix
<
T
>::
From
(
*
out_grad
);
Eigen
::
array
<
int
,
2
>
bcast
({{
1
,
static_cast
<
int
>
(
pre_out_grad
.
dims
()[
1
])}})
;
Eigen
::
array
<
int
,
2
>
bcast
{
1
,
static_cast
<
int
>
(
pre_out_grad
.
dims
()[
1
])}
;
// softrelu derivative
pre_out_grad_mat
.
device
(
place
)
=
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
b5c44fd4
if
(
NOT WIN32
)
add_subdirectory
(
detail
)
endif
(
NOT WIN32
)
add_subdirectory
(
detail
)
function
(
math_library TARGET
)
# math_library is a function to create math library.
...
...
@@ -43,10 +41,8 @@ math_library(depthwise_conv)
math_library
(
im2col
)
math_library
(
sampler
)
if
(
NOT WIN32
)
# windows do not support avx functions yet.
math_library
(
gru_compute DEPS activation_functions math_function
)
math_library
(
lstm_compute DEPS activation_functions
)
endif
(
NOT WIN32
)
math_library
(
gru_compute DEPS activation_functions math_function
)
math_library
(
lstm_compute DEPS activation_functions
)
cc_library
(
blas SRCS blas.cc DEPS cblas framework_proto device_context
)
math_library
(
math_function DEPS blas
)
...
...
@@ -58,9 +54,9 @@ math_library(sequence_padding)
math_library
(
sequence_pooling DEPS math_function
)
math_library
(
sequence_scale
)
math_library
(
softmax DEPS math_function
)
if
(
NOT WIN32
)
math_library
(
matrix_bit_code
)
endif
(
NOT WIN32
)
math_library
(
matrix_bit_code
)
math_library
(
unpooling
)
math_library
(
vol2col
)
...
...
@@ -76,13 +72,12 @@ if(WITH_GPU)
endif
()
cc_test
(
concat_test SRCS concat_test.cc DEPS concat_and_split
)
cc_test
(
cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info
)
if
(
NOT WIN32
)
set
(
JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc
)
set
(
JIT_KERNEL_DEPS cpu_info cblas gflags enforce
)
if
(
WITH_XBYAK
)
set
(
JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc
)
set
(
JIT_KERNEL_DEPS cpu_info cblas gflags enforce
)
if
(
WITH_XBYAK
)
list
(
APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc
)
list
(
APPEND JIT_KERNEL_DEPS xbyak
)
endif
()
cc_library
(
jit_kernel SRCS
${
JIT_KERNEL_SRCS
}
DEPS
${
JIT_KERNEL_DEPS
}
)
cc_test
(
jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel
)
endif
(
NOT WIN32
)
endif
()
cc_library
(
jit_kernel SRCS
${
JIT_KERNEL_SRCS
}
DEPS
${
JIT_KERNEL_DEPS
}
)
cc_test
(
jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel
)
paddle/fluid/operators/math/blas_impl.cu.h
浏览文件 @
b5c44fd4
...
...
@@ -16,6 +16,9 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/gpu_info.h"
DECLARE_bool
(
enable_cublas_tensor_op_math
);
namespace
paddle
{
namespace
operators
{
...
...
@@ -42,11 +45,44 @@ struct CUBlas<float> {
}
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
static
void
GEMM_
STRIDED_
BATCH
(
ARGS
...
args
)
{
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemmStridedBatched
(
args
...));
#else
PADDLE_THROW
(
"SgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
platform
::
CUDADeviceContext
*
dev_ctx
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
float
*
alpha
,
const
void
*
A
,
cudaDataType_t
Atype
,
int
lda
,
const
void
*
B
,
cudaDataType_t
Btype
,
int
ldb
,
const
float
*
beta
,
void
*
C
,
cudaDataType_t
Ctype
,
int
ldc
)
{
// Because gcc 4.8 does not expand a template parameter pack that
// appears in a lambda-expression, a template parameter pack cannot be
// used here.
auto
cublas_call
=
[
&
]()
{
#if CUDA_VERSION >= 8000
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
platform
::
TensorCoreAvailable
()
?
"True"
:
"False"
);
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemmEx
(
dev_ctx
->
cublas_handle
(),
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
));
#else
PADDLE_THROW
(
"cublasSgemmEx is supported on cuda >= 8.0"
);
#endif
};
#if CUDA_VERSION >= 9000
// NOTES: To use Tensor Core, we should change the cublas config,
// but the cublas may be held by multiple threads.
dev_ctx
->
CublasCall
(
cublas_call
,
CUBLAS_TENSOR_OP_MATH
);
#else
cublas_call
();
#endif
}
};
...
...
@@ -69,13 +105,18 @@ struct CUBlas<double> {
}
template
<
typename
...
ARGS
>
static
void
GEMM_BATCH
(
ARGS
...
args
)
{
static
void
GEMM_
STRIDED_
BATCH
(
ARGS
...
args
)
{
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasDgemmStridedBatched
(
args
...));
#else
PADDLE_THROW
(
"DgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
ARGS
...
args
)
{
PADDLE_THROW
(
"Currently there are not cublasDgemmEx."
);
}
};
template
<
>
...
...
@@ -96,10 +137,12 @@ struct CUBlas<platform::float16> {
reinterpret_cast
<
__half
*>
(
C
),
ldc
));
}
static
void
GEMM_BATCH
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
static
void
GEMM_STRIDED_BATCH
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
float16
*
alpha
,
const
float16
*
A
,
int
lda
,
long
long
int
strideA
,
const
float16
*
B
,
// NOLINT
const
float16
*
alpha
,
const
float16
*
A
,
int
lda
,
long
long
int
strideA
,
// NOLINT
const
float16
*
B
,
// NOLINT
int
ldb
,
long
long
int
strideB
,
// NOLINT
const
float16
*
beta
,
float16
*
C
,
int
ldc
,
long
long
int
strideC
,
// NOLINT
...
...
@@ -114,6 +157,45 @@ struct CUBlas<platform::float16> {
ldc
,
strideC
,
batchCount
));
#else
PADDLE_THROW
(
"HgemmStridedBatched is not supported on cuda <= 7.5"
);
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
platform
::
CUDADeviceContext
*
dev_ctx
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
void
*
alpha
,
const
void
*
A
,
cudaDataType_t
Atype
,
int
lda
,
const
void
*
B
,
cudaDataType_t
Btype
,
int
ldb
,
const
void
*
beta
,
void
*
C
,
cudaDataType_t
Ctype
,
int
ldc
,
cudaDataType_t
computeType
)
{
auto
cublas_call
=
[
&
]()
{
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
#if CUDA_VERSION >= 9000
bool
use_tensor_op_math
=
platform
::
TensorCoreAvailable
();
if
(
use_tensor_op_math
)
{
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
use_tensor_op_math
?
"True"
:
"False"
);
#endif // CUDA_VERSION >= 9000
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasGemmEx
(
dev_ctx
->
cublas_handle
(),
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
,
computeType
,
algo
));
#else
PADDLE_THROW
(
"cublasGemmEx is supported on cuda >= 8.0"
);
#endif
};
#if CUDA_VERSION >= 9000
// NOTES: To use Tensor Core, we should change the cublas config,
// but the cublas may be held by multiple threads.
dev_ctx
->
CublasCall
(
cublas_call
,
CUBLAS_TENSOR_OP_MATH
);
#else
cublas_call
();
#endif
}
};
...
...
@@ -133,8 +215,21 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
N
);
#if CUDA_VERSION >= 8000
if
(
FLAGS_enable_cublas_tensor_op_math
&&
std
::
is_same
<
T
,
float
>::
value
)
{
auto
&
cuda_ctx
=
const_cast
<
platform
::
CUDADeviceContext
&>
(
context_
);
CUBlas
<
T
>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
CUDA_R_32F
,
ldb
,
A
,
CUDA_R_32F
,
lda
,
&
beta
,
C
,
CUDA_R_32F
,
N
);
}
else
{
#endif // CUDA_VERSION >= 8000
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
N
);
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template
<
>
...
...
@@ -157,30 +252,18 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
PADDLE_ENFORCE_GE
(
context_
.
GetComputeCapability
(),
53
,
"cublas fp16 gemm requires GPU compute capability >= 53"
);
#if CUDA_VERSION >= 8000
float
h_alpha
=
static_cast
<
float
>
(
alpha
);
float
h_beta
=
static_cast
<
float
>
(
beta
);
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
#if CUDA_VERSION >= 9000
if
(
context_
.
GetComputeCapability
()
>=
70
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSetMathMode
(
context_
.
cublas_handle
(),
CUBLAS_TENSOR_OP_MATH
));
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
else
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSetMathMode
(
context_
.
cublas_handle
(),
CUBLAS_DEFAULT_MATH
));
}
#endif // CUDA_VERSION >= 9000
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pseudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasGemmEx
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
B
,
CUDA_R_16F
,
ldb
,
A
,
CUDA_R_16F
,
lda
,
&
h_beta
,
C
,
CUDA_R_16F
,
N
,
CUDA_R_
32F
,
algo
)
);
auto
&
cuda_ctx
=
const_cast
<
platform
::
CUDADeviceContext
&>
(
context_
);
CUBlas
<
platform
::
float16
>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
B
,
CUDA_R_16F
,
ldb
,
A
,
CUDA_R_
16F
,
lda
,
&
h_beta
,
C
,
CUDA_R_16F
,
N
,
CUDA_R_32F
);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
CUBlas
<
platform
::
float16
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
...
...
@@ -199,8 +282,38 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
// the cblas convention.
cublasOperation_t
cuTransA
=
transA
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
cublasOperation_t
cuTransB
=
transB
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
#if CUDA_VERSION >= 8000
if
(
FLAGS_enable_cublas_tensor_op_math
&&
std
::
is_same
<
T
,
float
>::
value
)
{
auto
&
cuda_ctx
=
const_cast
<
platform
::
CUDADeviceContext
&>
(
context_
);
CUBlas
<
T
>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
CUDA_R_32F
,
ldb
,
A
,
CUDA_R_32F
,
lda
,
&
beta
,
C
,
CUDA_R_32F
,
ldc
);
}
else
{
#endif // CUDA_VERSION >= 8000
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template
<
>
template
<
>
inline
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
platform
::
float16
alpha
,
const
platform
::
float16
*
A
,
int
lda
,
const
platform
::
float16
*
B
,
int
ldb
,
platform
::
float16
beta
,
platform
::
float16
*
C
,
int
ldc
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t
cuTransA
=
transA
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
cublasOperation_t
cuTransB
=
transB
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
CUBlas
<
platform
::
float16
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
}
template
<
>
...
...
@@ -238,9 +351,34 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
const
int64_t
strideC
=
M
*
N
;
CUBlas
<
T
>::
GEMM_BATCH
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
);
#if CUDA_VERSION >= 9010
if
(
FLAGS_enable_cublas_tensor_op_math
&&
std
::
is_same
<
T
,
float
>::
value
)
{
auto
cublas_call
=
[
&
]()
{
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
bool
use_tensor_op_math
=
platform
::
TensorCoreAvailable
();
if
(
use_tensor_op_math
)
{
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
use_tensor_op_math
?
"True"
:
"False"
);
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasGemmStridedBatchedEx
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
CUDA_R_32F
,
ldb
,
strideB
,
A
,
CUDA_R_32F
,
lda
,
strideA
,
&
beta
,
C
,
CUDA_R_32F
,
ldc
,
strideC
,
batchCount
,
CUDA_R_32F
,
algo
));
};
auto
&
dev_ctx
=
const_cast
<
platform
::
CUDADeviceContext
&>
(
context_
);
dev_ctx
.
CublasCall
(
cublas_call
,
CUBLAS_TENSOR_OP_MATH
);
}
else
{
#endif // CUDA_VERSION >= 9010
CUBlas
<
T
>::
GEMM_STRIDED_BATCH
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
);
#if CUDA_VERSION >= 9010
}
#endif // CUDA_VERSION >= 9010
}
}
// namespace math
...
...
paddle/fluid/operators/math/detail/activation_functions.h
浏览文件 @
b5c44fd4
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <math.h>
#include <string>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h"
...
...
paddle/fluid/operators/math/fc_compute.h
浏览文件 @
b5c44fd4
...
...
@@ -17,8 +17,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
operators
{
namespace
math
{
...
...
@@ -43,7 +41,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
.
template
Get
<
jitkernel
::
VAddKernel
<
T
>
>
(
N
);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
if (FLAGS_paddle_num_threads > 1)
#pragma omp parallel for
#endif
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
T
*
dst
=
Y
+
i
*
N
;
...
...
paddle/fluid/operators/math/jit_code.cc
浏览文件 @
b5c44fd4
...
...
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_code.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me
namespace
paddle
{
namespace
operators
{
...
...
@@ -60,99 +59,60 @@ void VXXJitCode::generate() {
offset
+=
sizeof
(
float
)
*
YMM_FLOAT_BLOCK
;
}
int
rest
=
num_
%
YMM_FLOAT_BLOCK
;
while
(
rest
>
0
)
{
int
block
=
XMM_FLOAT_BLOCK
;
if
(
rest
>=
4
)
{
block
=
4
;
if
(
scalar_index_
!=
1
)
{
vmovups
(
xmm_src1
,
ptr
[
param1
+
offset
]);
}
if
(
scalar_index_
!=
2
)
{
vmovups
(
xmm_src2
,
ptr
[
param2
+
offset
]);
}
if
(
type_
==
operand_type
::
mul
)
{
vmulps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
else
if
(
type_
==
operand_type
::
add
)
{
vaddps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
if
(
with_relu_
)
{
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_dst
);
}
else
if
(
rest
>=
2
)
{
block
=
2
;
if
(
scalar_index_
!=
1
)
{
vmovq
(
xmm_src1
,
ptr
[
param1
+
offset
]);
}
vmovups
(
ptr
[
param3
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
4
;
rest
-=
4
;
if
(
scalar_index_
!=
2
)
{
vmovq
(
xmm_src2
,
ptr
[
param2
+
offset
]);
}
if
(
rest
>=
2
)
{
}
else
{
block
=
1
;
if
(
scalar_index_
!=
1
)
{
vmovup
s
(
xmm_src1
,
ptr
[
param1
+
offset
]);
vmovs
s
(
xmm_src1
,
ptr
[
param1
+
offset
]);
}
if
(
scalar_index_
!=
2
)
{
vmovup
s
(
xmm_src2
,
ptr
[
param2
+
offset
]);
vmovs
s
(
xmm_src2
,
ptr
[
param2
+
offset
]);
}
if
(
type_
==
operand_type
::
mul
)
{
}
switch
(
type_
)
{
case
operand_type
::
mul
:
vmulps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
else
if
(
type_
==
operand_type
::
add
)
{
break
;
case
operand_type
::
add
:
vaddps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
break
;
default:
break
;
}
if
(
with_relu_
)
{
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_dst
);
}
if
(
rest
>=
4
)
{
vmovups
(
ptr
[
param3
+
offset
],
xmm_dst
);
}
else
if
(
rest
>=
2
)
{
vmovq
(
ptr
[
param3
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
2
;
rest
-=
2
;
}
if
(
rest
>
0
)
{
if
(
scalar_index_
!=
1
)
{
vmovups
(
xmm_src1
,
ptr
[
param1
+
offset
]);
}
if
(
scalar_index_
!=
2
)
{
vmovups
(
xmm_src2
,
ptr
[
param2
+
offset
]);
}
if
(
type_
==
operand_type
::
mul
)
{
vmulss
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
else
if
(
type_
==
operand_type
::
add
)
{
vaddss
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
if
(
with_relu_
)
{
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_dst
);
}
}
else
{
vmovss
(
ptr
[
param3
+
offset
],
xmm_dst
);
}
offset
+=
sizeof
(
float
)
*
block
;
rest
-=
block
;
}
ret
();
}
#define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define CEPHES_LOG2EF 1.44269504088896341
#define CEPHES_EXP_C1 0.693359375
#define CEPHES_EXP_C2 -2.12194440e-4
#define CEPHES_EXP_P0 1.9875691500E-4
#define CEPHES_EXP_P1 1.3981999507E-3
#define CEPHES_EXP_P2 8.3334519073E-3
#define CEPHES_EXP_P3 4.1665795894E-2
#define CEPHES_EXP_P4 1.6666665459E-1
#define CEPHES_EXP_P5 5.0000001201E-1
#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
static
const
float
exp_float_consts
[]
ALIGN32
=
{
REPEAT_8TIMES
(
1.
f
),
const
float
exp_float_consts
[]
ALIGN32
=
{
REPEAT_8TIMES
(
1.
f
),
REPEAT_8TIMES
(
2.
f
),
REPEAT_8TIMES
(
0.5
f
),
REPEAT_8TIMES
(
EXP_HIG
),
...
...
@@ -170,147 +130,12 @@ static const float exp_float_consts[] ALIGN32 = {
REPEAT_8TIMES
(
SIGMOID_THRESHOLD_MAX
),
REPEAT_8TIMES
(
SIGMOID_THRESHOLD_MIN
)};
static
const
int
exp_int_0x7f
[]
ALIGN32
=
{
REPEAT_8TIMES
(
0x7f
)};
static
int
g_tmp_mem
[
16
]
ALIGN32
=
{
0
};
const
int
exp_int_0x7f
[]
ALIGN32
=
{
REPEAT_8TIMES
(
0x7f
)};
int
g_tmp_mem
[
16
]
ALIGN32
=
{
0
};
bool
VActJitCode
::
init
(
int
d
,
operand_type
type
)
{
bool
ok
=
MayIUse
(
avx
);
if
(
type
==
operand_type
::
relu
)
{
return
ok
;
}
else
if
(
type
==
operand_type
::
exp
)
{
// exp is slower than mkl when d >= 256
return
ok
&&
d
%
8
==
0
&&
d
<
256
;
}
else
{
// TODO(TJ): support more
return
ok
&&
d
%
8
==
0
;
}
}
void
VActJitCode
::
relu_ymm
(
ymm_t
&
ymm_dst
,
ymm_t
&
ymm_src
,
ymm_t
&
ymm_zero
)
{
vmaxps
(
ymm_dst
,
ymm_zero
,
ymm_src
);
}
void
VActJitCode
::
exp_ymm
(
ymm_t
&
ymm_dst
,
ymm_t
&
ymm_src
,
int
fx_idx
,
int
fy_idx
,
int
mask_idx
,
int
tmp_idx
)
{
assert
(
ymm_src
.
getIdx
()
!=
ymm_dst
.
getIdx
());
// TODO(TJ): use enforce
// check all idx can not equal
ymm_t
ymm_fx
=
ymm_t
(
fx_idx
);
ymm_t
ymm_fy
=
ymm_t
(
fy_idx
);
ymm_t
ymm_mask
=
ymm_t
(
mask_idx
);
ymm_t
ymm_tmp
=
ymm_t
(
tmp_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_HIG
]);
vminps
(
ymm_src
,
ymm_src
,
ymm_tmp
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOW
]);
vmaxps
(
ymm_src
,
ymm_src
,
ymm_tmp
);
// express exp(x) as exp(g + n*log(2))
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOG2EF
]);
vmulps
(
ymm_fx
,
ymm_src
,
ymm_tmp
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_0P5
]);
vaddps
(
ymm_fx
,
ymm_fx
,
ymm_tmp
);
vroundps
(
ymm_fy
,
ymm_fx
,
0x01
);
// if greater, subtract 1
vcmpgtps
(
ymm_mask
,
ymm_fy
,
ymm_fx
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
]);
vandps
(
ymm_mask
,
ymm_mask
,
ymm_tmp
);
vsubps
(
ymm_fx
,
ymm_fy
,
ymm_mask
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_C1
]);
vmulps
(
ymm_fy
,
ymm_fx
,
ymm_tmp
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_C2
]);
ymm_t
ymm_z
=
ymm_t
(
ymm_mask
.
getIdx
());
vmulps
(
ymm_z
,
ymm_fx
,
ymm_tmp
);
vsubps
(
ymm_src
,
ymm_src
,
ymm_fy
);
vsubps
(
ymm_src
,
ymm_src
,
ymm_z
);
vmulps
(
ymm_z
,
ymm_src
,
ymm_src
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P0
]);
vmulps
(
ymm_dst
,
ymm_src
,
ymm_tmp
);
for
(
size_t
i
=
OFFSET_EXP_P1
;
i
<
OFFSET_EXP_P5
;
i
+=
(
YMM_FLOAT_BLOCK
*
sizeof
(
float
)))
{
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
i
]);
// P1~P4
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
vmulps
(
ymm_dst
,
ymm_dst
,
ymm_src
);
}
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P5
]);
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
vmulps
(
ymm_dst
,
ymm_dst
,
ymm_z
);
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_src
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
]);
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
// build 2^n
ymm_t
ymm_int
=
ymm_fx
;
vcvttps2dq
(
ymm_int
,
ymm_fx
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_int_0x7f
));
vmovdqa
(
ymm_tmp
,
ptr
[
reg_ptr_global
]);
if
(
MayIUse
(
avx2
))
{
vpaddd
(
ymm_int
,
ymm_int
,
ymm_tmp
);
vpslld
(
ymm_int
,
ymm_int
,
23
);
}
else
if
(
MayIUse
(
avx
))
{
xmm_t
xtmp1
=
xmm_t
(
ymm_int
.
getIdx
());
xmm_t
xtmp2
=
xmm_t
(
ymm_tmp
.
getIdx
());
reg64_t
reg_ptr_tmp
=
reg_ptr_global
;
mov
(
reg_ptr_tmp
,
reinterpret_cast
<
size_t
>
(
g_tmp_mem
));
vmovdqa
(
ptr
[
reg_ptr_tmp
],
ymm_int
);
vmovdqa
(
ptr
[
reg_ptr_tmp
+
YMM_FLOAT_BLOCK
*
sizeof
(
float
)],
ymm_tmp
);
vpaddd
(
xtmp1
,
xtmp1
,
xtmp2
);
vpslld
(
xtmp1
,
xtmp1
,
23
);
vmovdqa
(
ptr
[
reg_ptr_tmp
],
xtmp1
);
// next 128bits
vmovdqa
(
xtmp1
,
ptr
[
reg_ptr_tmp
+
4
/*xmm float block*/
*
sizeof
(
float
)]);
vmovdqa
(
xtmp2
,
ptr
[
reg_ptr_tmp
+
(
YMM_FLOAT_BLOCK
+
4
/*xmm float block*/
)
*
sizeof
(
float
)]);
vpaddd
(
xtmp1
,
xtmp1
,
xtmp2
);
vpslld
(
xtmp1
,
xtmp1
,
23
);
vmovdqa
(
ptr
[
reg_ptr_tmp
+
4
/*xmm float block*/
*
sizeof
(
float
)],
xtmp1
);
// load out
vmovdqa
(
ymm_int
,
ptr
[
reg_ptr_tmp
]);
}
vmulps
(
ymm_dst
,
ymm_dst
,
ymm_int
);
pop
(
reg_ptr_global
);
}
void
VActJitCode
::
sigmoid_ymm
(
ymm_t
&
ymm_dst
,
ymm_t
&
ymm_src
,
int
fx_idx
,
int
fy_idx
,
int
mask_idx
,
int
tmp_idx
)
{
// y = 1 / (1 + e^-x)
ymm_t
ymm_tmp
=
ymm_t
(
tmp_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MAX
]);
vminps
(
ymm_src
,
ymm_src
,
ymm_tmp
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MIN
]);
vmaxps
(
ymm_src
,
ymm_src
,
ymm_tmp
);
vxorps
(
ymm_tmp
,
ymm_tmp
,
ymm_tmp
);
vsubps
(
ymm_src
,
ymm_tmp
,
ymm_src
);
exp_ymm
(
ymm_dst
,
ymm_src
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
vdivps
(
ymm_dst
,
ymm_tmp
,
ymm_dst
);
pop
(
reg_ptr_global
);
}
void
VActJitCode
::
tanh_ymm
(
ymm_t
&
ymm_dst
,
ymm_t
&
ymm_src
,
int
fx_idx
,
int
fy_idx
,
int
mask_idx
,
int
tmp_idx
)
{
// y = 2 / (1 + e^(-2x)) - 1
ymm_t
ymm_tmp
=
ymm_t
(
tmp_idx
);
ymm_t
ymm_zero
=
ymm_t
(
mask_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vxorps
(
ymm_zero
,
ymm_zero
,
ymm_zero
);
vsubps
(
ymm_tmp
,
ymm_zero
,
ymm_tmp
);
vmulps
(
ymm_src
,
ymm_src
,
ymm_tmp
);
exp_ymm
(
ymm_dst
,
ymm_src
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vaddps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vdivps
(
ymm_dst
,
ymm_tmp
,
ymm_dst
);
vmovaps
(
ymm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vsubps
(
ymm_dst
,
ymm_dst
,
ymm_tmp
);
pop
(
reg_ptr_global
);
// TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
return
MayIUse
(
avx
);
}
void
VActJitCode
::
generate
()
{
...
...
@@ -324,16 +149,16 @@ void VActJitCode::generate() {
vmovups
(
ymm_src
,
ptr
[
param1
+
offset
]);
switch
(
type_
)
{
case
operand_type
::
relu
:
relu_
ymm
(
ymm_dst
,
ymm_src
,
ymm_zero
);
relu_
jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
ymm_zero
);
break
;
case
operand_type
::
exp
:
exp_
ymm
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
exp_
jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
sigmoid
:
sigmoid_
ymm
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
sigmoid_
jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
tanh
:
tanh_
ymm
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
tanh_
jmm
<
ymm_t
>
(
ymm_dst
,
ymm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
identity
:
break
;
...
...
@@ -343,31 +168,45 @@ void VActJitCode::generate() {
vmovups
(
ptr
[
param2
+
offset
],
ymm_dst
);
offset
+=
sizeof
(
float
)
*
YMM_FLOAT_BLOCK
;
}
if
(
type_
!=
operand_type
::
relu
)
{
// TODO(TJ): remove me
ret
();
return
;
}
int
rest
=
num_
%
YMM_FLOAT_BLOCK
;
while
(
rest
>
0
)
{
int
block
=
XMM_FLOAT_BLOCK
;
if
(
rest
>=
4
)
{
block
=
4
;
vmovups
(
xmm_src
,
ptr
[
param1
+
offset
]);
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_src
);
vmovups
(
ptr
[
param2
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
4
;
rest
-=
4
;
}
else
if
(
rest
>=
2
)
{
block
=
2
;
vmovq
(
xmm_src
,
ptr
[
param1
+
offset
]);
}
else
{
block
=
1
;
vmovss
(
xmm_src
,
ptr
[
param1
+
offset
]);
}
if
(
rest
>=
2
)
{
vmovups
(
xmm_src
,
ptr
[
param1
+
offset
]);
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_src
);
vmovq
(
ptr
[
param2
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
2
;
rest
-=
2
;
switch
(
type_
)
{
case
operand_type
::
relu
:
relu_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
xmm_zero
);
break
;
case
operand_type
::
exp
:
exp_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
sigmoid
:
sigmoid_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
2
,
3
,
4
,
5
);
break
;
case
operand_type
::
tanh
:
tanh_jmm
<
xmm_t
>
(
xmm_dst
,
xmm_src
,
2
,
3
,
4
,
5
);
break
;
default:
break
;
}
if
(
rest
>
0
)
{
vmovups
(
xmm_src
,
ptr
[
param1
+
offset
]);
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_src
);
if
(
rest
>=
4
)
{
vmovups
(
ptr
[
param2
+
offset
],
xmm_dst
);
}
else
if
(
rest
>=
2
)
{
vmovq
(
ptr
[
param2
+
offset
],
xmm_dst
);
}
else
{
vmovss
(
ptr
[
param2
+
offset
],
xmm_dst
);
}
offset
+=
sizeof
(
float
)
*
block
;
rest
-=
block
;
}
ret
();
}
...
...
paddle/fluid/operators/math/jit_code.h
浏览文件 @
b5c44fd4
...
...
@@ -16,6 +16,8 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/math/jit_gen.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
...
...
@@ -40,6 +42,51 @@ typedef enum {
identity
}
operand_type
;
extern
const
float
exp_float_consts
[];
extern
const
int
exp_int_0x7f
[];
extern
int
g_tmp_mem
[];
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define XMM_FLOAT_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define ZMM_FLOAT_BLOCK 16
#define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define CEPHES_LOG2EF 1.44269504088896341
#define CEPHES_EXP_C1 0.693359375
#define CEPHES_EXP_C2 -2.12194440e-4
#define CEPHES_EXP_P0 1.9875691500E-4
#define CEPHES_EXP_P1 1.3981999507E-3
#define CEPHES_EXP_P2 8.3334519073E-3
#define CEPHES_EXP_P3 4.1665795894E-2
#define CEPHES_EXP_P4 1.6666665459E-1
#define CEPHES_EXP_P5 5.0000001201E-1
#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
class
VXXJitCode
:
public
JitCode
{
public:
...
...
@@ -127,21 +174,140 @@ class VActJitCode : public JitCode {
void
generate
()
override
;
protected:
// compute relu with ymm
void
relu_ymm
(
const
Xbyak
::
Ymm
&
dst
,
const
Xbyak
::
Ymm
&
src
,
const
Xbyak
::
Ymm
&
zero
);
// compute relu with ymm, xmm
template
<
typename
JMM
>
void
relu_jmm
(
JMM
&
dst
,
JMM
&
src
,
JMM
&
zero
)
{
// NOLINT
vmaxps
(
dst
,
src
,
zero
);
}
// compute exp with ymm
void
exp_ymm
(
const
Xbyak
::
Ymm
&
dst
,
const
Xbyak
::
Ymm
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
int
mask_idx
=
4
,
int
tmp_idx
=
5
);
// compute exp with ymm, xmm
template
<
typename
JMM
>
void
exp_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
// NOLINT
int
mask_idx
=
4
,
int
tmp_idx
=
5
)
{
using
namespace
platform
::
jit
;
// NOLINT
assert
(
src
.
getIdx
()
!=
dst
.
getIdx
());
// TODO(TJ): use enforce
// check all idx can not equal
JMM
jmm_fx
=
JMM
(
fx_idx
);
JMM
jmm_fy
=
JMM
(
fy_idx
);
JMM
jmm_mask
=
JMM
(
mask_idx
);
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_HIG
]);
vminps
(
src
,
src
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOW
]);
vmaxps
(
src
,
src
,
jmm_tmp
);
// express exp(x) as exp(g + n*log(2))
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_LOG2EF
]);
vmulps
(
jmm_fx
,
src
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_0P5
]);
vaddps
(
jmm_fx
,
jmm_fx
,
jmm_tmp
);
vroundps
(
jmm_fy
,
jmm_fx
,
0x01
);
// if greater, subtract 1
vcmpgtps
(
jmm_mask
,
jmm_fy
,
jmm_fx
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
]);
vandps
(
jmm_mask
,
jmm_mask
,
jmm_tmp
);
vsubps
(
jmm_fx
,
jmm_fy
,
jmm_mask
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_C1
]);
vmulps
(
jmm_fy
,
jmm_fx
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_C2
]);
JMM
ymm_z
=
JMM
(
jmm_mask
.
getIdx
());
vmulps
(
ymm_z
,
jmm_fx
,
jmm_tmp
);
vsubps
(
src
,
src
,
jmm_fy
);
vsubps
(
src
,
src
,
ymm_z
);
vmulps
(
ymm_z
,
src
,
src
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P0
]);
vmulps
(
dst
,
src
,
jmm_tmp
);
for
(
size_t
i
=
OFFSET_EXP_P1
;
i
<
OFFSET_EXP_P5
;
i
+=
(
YMM_FLOAT_BLOCK
*
sizeof
(
float
)))
{
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
i
]);
// P1~P4
vaddps
(
dst
,
dst
,
jmm_tmp
);
vmulps
(
dst
,
dst
,
src
);
}
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_P5
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vmulps
(
dst
,
dst
,
ymm_z
);
vaddps
(
dst
,
dst
,
src
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
// build 2^n
JMM
ymm_int
=
jmm_fx
;
vcvttps2dq
(
ymm_int
,
jmm_fx
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_int_0x7f
));
vmovdqa
(
jmm_tmp
,
ptr
[
reg_ptr_global
]);
if
(
MayIUse
(
avx2
)
||
std
::
is_same
<
JMM
,
xmm_t
>::
value
)
{
vpaddd
(
ymm_int
,
ymm_int
,
jmm_tmp
);
vpslld
(
ymm_int
,
ymm_int
,
23
);
}
else
if
(
MayIUse
(
avx
))
{
xmm_t
xtmp1
=
xmm_t
(
ymm_int
.
getIdx
());
xmm_t
xtmp2
=
xmm_t
(
jmm_tmp
.
getIdx
());
reg64_t
reg_ptr_tmp
=
reg_ptr_global
;
mov
(
reg_ptr_tmp
,
reinterpret_cast
<
size_t
>
(
g_tmp_mem
));
vmovdqa
(
ptr
[
reg_ptr_tmp
],
ymm_int
);
vmovdqa
(
ptr
[
reg_ptr_tmp
+
YMM_FLOAT_BLOCK
*
sizeof
(
float
)],
jmm_tmp
);
vpaddd
(
xtmp1
,
xtmp1
,
xtmp2
);
vpslld
(
xtmp1
,
xtmp1
,
23
);
vmovdqa
(
ptr
[
reg_ptr_tmp
],
xtmp1
);
// next 128bits
vmovdqa
(
xtmp1
,
ptr
[
reg_ptr_tmp
+
XMM_FLOAT_BLOCK
*
sizeof
(
float
)]);
vmovdqa
(
xtmp2
,
ptr
[
reg_ptr_tmp
+
(
YMM_FLOAT_BLOCK
+
XMM_FLOAT_BLOCK
)
*
sizeof
(
float
)]);
vpaddd
(
xtmp1
,
xtmp1
,
xtmp2
);
vpslld
(
xtmp1
,
xtmp1
,
23
);
vmovdqa
(
ptr
[
reg_ptr_tmp
+
XMM_FLOAT_BLOCK
*
sizeof
(
float
)],
xtmp1
);
// load out
vmovdqa
(
ymm_int
,
ptr
[
reg_ptr_tmp
]);
}
vmulps
(
dst
,
dst
,
ymm_int
);
pop
(
reg_ptr_global
);
}
// compute sigmoid with ymm
void
sigmoid_ymm
(
const
Xbyak
::
Ymm
&
dst
,
const
Xbyak
::
Ymm
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
int
mask_idx
=
4
,
int
tmp_idx
=
5
);
// compute sigmoid with ymm, xmm
template
<
typename
JMM
>
void
sigmoid_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
fx_idx
=
2
,
// NOLINT
int
fy_idx
=
3
,
int
mask_idx
=
4
,
int
tmp_idx
=
5
)
{
// y = 1 / (1 + e^-x)
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MAX
]);
vminps
(
src
,
src
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_SIGMOID_MIN
]);
vmaxps
(
src
,
src
,
jmm_tmp
);
vxorps
(
jmm_tmp
,
jmm_tmp
,
jmm_tmp
);
vsubps
(
src
,
jmm_tmp
,
src
);
exp_jmm
<
JMM
>
(
dst
,
src
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vdivps
(
dst
,
jmm_tmp
,
dst
);
pop
(
reg_ptr_global
);
}
// compute tanh with ymm
void
tanh_ymm
(
const
Xbyak
::
Ymm
&
dst
,
const
Xbyak
::
Ymm
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
int
mask_idx
=
4
,
int
tmp_idx
=
5
);
// compute tanh with ymm, xmm
template
<
typename
JMM
>
void
tanh_jmm
(
JMM
&
dst
,
JMM
&
src
,
int
fx_idx
=
2
,
int
fy_idx
=
3
,
// NOLINT
int
mask_idx
=
4
,
int
tmp_idx
=
5
)
{
// y = 2 / (1 + e^(-2x)) - 1
JMM
jmm_tmp
=
JMM
(
tmp_idx
);
JMM
jmm_zero
=
JMM
(
mask_idx
);
reg64_t
reg_ptr_global
=
rax
;
push
(
reg_ptr_global
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_float_consts
));
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vxorps
(
jmm_zero
,
jmm_zero
,
jmm_zero
);
vsubps
(
jmm_tmp
,
jmm_zero
,
jmm_tmp
);
vmulps
(
src
,
src
,
jmm_tmp
);
exp_jmm
<
JMM
>
(
dst
,
src
,
fx_idx
,
fy_idx
,
mask_idx
,
tmp_idx
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vaddps
(
dst
,
dst
,
jmm_tmp
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_TWO
]);
vdivps
(
dst
,
jmm_tmp
,
dst
);
vmovaps
(
jmm_tmp
,
ptr
[
reg_ptr_global
+
OFFSET_EXP_ONE
]);
vsubps
(
dst
,
dst
,
jmm_tmp
);
pop
(
reg_ptr_global
);
}
protected:
int
num_
;
...
...
@@ -156,6 +322,42 @@ class VActJitCode : public JitCode {
ymm_t
ymm_dst
=
ymm_t
(
1
);
};
#ifdef PADDLE_WITH_MKLDNN
// JIT generator for an element-wise multiply of 16-float blocks.
//
// The generated function loads 16 floats from y_input once, then for each of
// height * width consecutive 64-byte blocks of x_input multiplies the block by
// those 16 floats and stores the result at the same offset in output.
//
// NOTE(review): both loops test their condition after the body, so the
// generated code assumes height >= 1 and width >= 1 -- confirm callers
// guarantee this.
struct EltwiseMulnChw16cNC : public Xbyak::CodeGenerator {
  explicit EltwiseMulnChw16cNC(size_t code_size = 256 * 1024)
      : Xbyak::CodeGenerator(code_size) {
    // RDI is ptr x_input
    // RSI is ptr y_input
    // RDX is ptr output
    // RCX is height
    // r8 is width

    push(rbx);  // rbx is used as the inner (width) counter

    xor_(rax, rax);  // rax: running byte offset into x_input / output
    xor_(r10, r10);  // r10: outer (height) counter
    vmovups(zmm3, ptr[rsi]);  // 16 floats of y_input, reused for every block

    L("h_loop");
    xor_(rbx, rbx);
    L("w_loop");
    vmovups(zmm2, ptr[rdi + rax]);
    vmulps(zmm1, zmm2, zmm3);
    vmovups(ptr[rdx + rax], zmm1);
    add(rax, 64);  // advance by one block of 16 floats
    inc(rbx);
    // height and width are passed as 32-bit ints, so the upper halves of
    // rcx / r8 are not guaranteed to be zero. Compare the 32-bit
    // sub-registers only; the counters start from zero so their low halves
    // are exact.
    cmp(r8d, ebx);
    jnz("w_loop");
    inc(r10);
    cmp(r10d, ecx);
    jnz("h_loop");

    pop(rbx);
    ret();
  }
};
#endif
}
// namespace gen
}
// namespace jitkernel
}
// namespace math
...
...
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
b5c44fd4
...
...
@@ -26,6 +26,7 @@ namespace operators {
namespace
math
{
namespace
jitkernel
{
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
...
...
@@ -94,6 +95,15 @@ class VAddBiasKernel : public Kernel {
void
(
*
Compute
)(
const
T
*
,
const
T
*
,
T
*
,
int
);
};
#ifdef PADDLE_WITH_MKLDNN
// Kernel interface for the element-wise product of an nChw16c tensor with an
// NC tensor. Implementations fill in `Compute`.
template <typename T>
class EltwiseMulnChw16cNCKernel : public Kernel {
 public:
  // nChw16c = nChw16c .* NC
  // NOTE(review): parameter names follow the register comments of the JIT
  // generator (x input, y input, output, height, width) -- confirm.
  void (*Compute)(const float* x, const float* y, float* out, int height,
                  int width);
};
#endif
template
<
typename
T
>
class
VActKernel
:
public
Kernel
{
public:
...
...
paddle/fluid/operators/math/jit_kernel_blas.cc
浏览文件 @
b5c44fd4
...
...
@@ -226,6 +226,44 @@ bool VAddKernelImpl<double>::useMKL(int d) {
}
#endif
#ifdef PADDLE_WITH_MKLDNN
/* EltwiseMul for nChw16c & NC inputs JitKernel */
// Implementation of EltwiseMulnChw16cNCKernel backed by Xbyak-generated code.
//
// When built with PADDLE_WITH_XBYAK and useJIT(d) is true, the constructor
// generates the kernel and stores its entry point in `Compute`. Otherwise the
// constructor throws via PADDLE_THROW.
//
// The closing brace of the class is kept outside the PADDLE_WITH_XBYAK guard
// so the class definition is well-formed in builds without Xbyak.
template <typename T>
class EltwiseMulnChw16cNCKernelImpl
    : public math::jitkernel::EltwiseMulnChw16cNCKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit EltwiseMulnChw16cNCKernelImpl(int d)
      : EltwiseMulnChw16cNCKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    using mul_func_t = void (*)(const float*, const float*, float*, int, int);
    if (useJIT(d)) {
      // roughly estimate the size of code
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
      sz = sz > 4096 ? sz : 4096;
      jitcode_.reset(new gen::EltwiseMulnChw16cNC(sz));
      this->Compute = (mul_func_t)jitcode_->getCode();
      return;
    }
#endif
    PADDLE_THROW(
        "This kernel shouldn't be used in Non-Xbyak, Non-MKL-DNN "
        "environemnt");
  }

#ifdef PADDLE_WITH_XBYAK

 private:
  // Owns the generated code; `Compute` points into it.
  std::unique_ptr<gen::EltwiseMulnChw16cNC> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
// The JIT path is always taken for float, regardless of d.
template <>
bool EltwiseMulnChw16cNCKernelImpl<float>::useJIT(int d) {
  return true;
}
#endif
#endif
/* VAddRelu JitKernel */
template
<
typename
T
>
class
VAddReluKernelImpl
:
public
VAddReluKernel
<
T
>
{
...
...
@@ -394,6 +432,9 @@ REGISTER_JITKERNEL(vscal, VScalKernel);
REGISTER_JITKERNEL
(
vaddbias
,
VAddBiasKernel
);
REGISTER_JITKERNEL
(
vrelu
,
VReluKernel
);
REGISTER_JITKERNEL
(
videntity
,
VIdentityKernel
);
#ifdef PADDLE_WITH_MKLDNN
REGISTER_JITKERNEL
(
eltwise_mul_nchw16c
,
EltwiseMulnChw16cNCKernel
);
#endif
}
// namespace jitkernel
}
// namespace math
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
b5c44fd4
...
...
@@ -33,6 +33,9 @@ limitations under the License. */
constexpr
int
repeat
=
20000
;
// TODO(TJ): benchmark and test should be separated,
// benchmark should verify more sizes
inline
double
GetCurrentUS
()
{
struct
timeval
time
;
gettimeofday
(
&
time
,
NULL
);
...
...
@@ -66,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) {
TEST
(
JitKernel
,
vrelu
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
for
(
int
d
:
{
3
,
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
10.
f
,
1.
f
);
...
...
@@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) {
TEST
(
JitKernel
,
vexp
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
128
,
256
})
{
for
(
int
d
:
{
1
,
3
,
4
,
6
,
7
,
8
,
12
,
15
,
16
,
20
,
30
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
...
...
@@ -231,7 +234,7 @@ void vsigmoid_better(
TEST
(
JitKernel
,
vsigmoid
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
for
(
int
d
:
{
1
,
3
,
4
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
...
...
@@ -295,7 +298,7 @@ void vtanh_better(
TEST
(
JitKernel
,
vtanh
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
for
(
int
d
:
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
...
...
@@ -386,7 +389,7 @@ void lstm_ctht_better(
TEST
(
JitKernel
,
lstm
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
})
{
for
(
int
d
:
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
})
{
int
d4
=
d
*
4
;
int
d3
=
d
*
3
;
std
::
vector
<
float
>
x
(
d4
),
xref
(
d4
);
...
...
@@ -759,7 +762,7 @@ TEST(JitKernel, vaddrelu) {
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vadd_ref
(
d
,
x_data
,
y_data
,
zref_data
);
vadd
relu
_ref
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
tmkls
=
GetCurrentUS
();
...
...
paddle/fluid/operators/math/matrix_bit_code.h
浏览文件 @
b5c44fd4
...
...
@@ -67,7 +67,7 @@ inline constexpr size_t FindLastSet(size_t x) {
:
(
std
::
is_same
<
size_t
,
unsigned
long
>::
value
// NOLINT
?
(
x
?
8
*
sizeof
(
x
)
-
__builtin_clzl
(
x
)
:
0
)
:
(
x
?
8
*
sizeof
(
x
)
-
__builtin_clzll
(
x
)
:
0
));
}
#else
// Windows doesn't have built-in clz/ctz functions
template
<
typename
T
>
...
...
@@ -92,7 +92,6 @@ inline int clz(const T& value) {
// Returns the bit width of size_t minus clz(x).
inline size_t FindLastSet(size_t x) {
  const size_t total_bits = sizeof(size_t) * 8;
  return total_bits - clz(x);
}
#endif // !_WIN32
}
struct
SimpleCode
{
SimpleCode
(
size_t
code
,
size_t
num_classes
)
:
c_
(
code
+
num_classes
)
{}
...
...
paddle/fluid/operators/math/pooling.cu
浏览文件 @
b5c44fd4
...
...
@@ -153,6 +153,37 @@ __global__ void KernelMaxPool2DGrad(
}
}
// Launch KernelPool2D on `stream` directly from raw pointers and shape
// vectors.
//
// input_shape / output_shape are read as {batch, channels, height, width};
// ksize, strides and paddings are read as {height, width}.
// One thread is requested per output element, in blocks of 1024 threads.
template <typename PoolProcess, typename T>
void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
    const T* input, const std::vector<int>& input_shape,
    const std::vector<int>& output_shape, const std::vector<int>& ksize,
    const std::vector<int>& strides, const std::vector<int>& paddings,
    PoolProcess pool_compute, bool exclusive, T* output, cudaStream_t stream) {
  // Input geometry.
  const int batch_size = input_shape[0];
  const int input_channels = input_shape[1];
  const int input_height = input_shape[2];
  const int input_width = input_shape[3];

  // Output geometry.
  const int output_channels = output_shape[1];
  const int output_height = output_shape[2];
  const int output_width = output_shape[3];

  // Pooling window, stride and padding, each as (height, width).
  const int ksize_height = ksize[0];
  const int ksize_width = ksize[1];
  const int stride_height = strides[0];
  const int stride_width = strides[1];
  const int padding_height = paddings[0];
  const int padding_width = paddings[1];

  // Total number of output elements, rounded up to whole 1024-thread blocks.
  int nthreads = batch_size * output_channels * output_height * output_width;
  int blocks = (nthreads + 1024 - 1) / 1024;
  dim3 grid(blocks, 1);
  dim3 threads(1024, 1);

  KernelPool2D<PoolProcess, T><<<grid, threads, 0, stream>>>(
      nthreads, input, input_channels, input_height, input_width,
      output_height, output_width, ksize_height, ksize_width, stride_height,
      stride_width, padding_height, padding_width, pool_compute, exclusive,
      output);
}
/*
* All tensors are in NCHW format.
* Ksize, strides, paddings are two elements. These two elements represent
...
...
@@ -291,6 +322,11 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
}
};
template
class
Pool2dDirectCUDAFunctor
<
paddle
::
operators
::
math
::
MaxPool
<
float
>,
float
>
;
template
class
Pool2dDirectCUDAFunctor
<
paddle
::
operators
::
math
::
AvgPool
<
float
>,
float
>
;
template
class
MaxPool2dGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
MaxPool2dGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
...
...
paddle/fluid/operators/math/pooling.h
浏览文件 @
b5c44fd4
...
...
@@ -82,6 +82,19 @@ class AvgPoolGrad {
* This is different from average pooling. So we rewrite the max_pool_grad:
* MaxPool2dGradFunctor, MaxPool3dGradFunctor.
*/
#ifdef PADDLE_WITH_CUDA
// 2-D pooling functor that takes raw device pointers, shape vectors and a
// CUDA stream instead of tensors and a device context.
template <typename PoolProcess, typename T>
class Pool2dDirectCUDAFunctor {
 public:
  // input / output:             raw data pointers.
  // input_shape / output_shape: dimensions of input and output.
  // ksize, strides, paddings:   pooling window, stride and padding.
  // pool_compute:               pooling operation object.
  // exclusive:                  forwarded to the pooling kernel.
  // stream:                     CUDA stream the work is issued on.
  void operator()(const T* input, const std::vector<int>& input_shape,
                  const std::vector<int>& output_shape,
                  const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_compute,
                  bool exclusive, T* output, cudaStream_t stream);
};
#endif
template
<
typename
DeviceContext
,
typename
PoolProcess
,
typename
T
>
class
Pool2dFunctor
{
public:
...
...
paddle/fluid/operators/math/softmax.h
浏览文件 @
b5c44fd4
...
...
@@ -19,7 +19,8 @@ namespace paddle {
namespace
operators
{
namespace
math
{
template
<
typename
DeviceContext
,
typename
T
,
bool
is_test
>
template
<
typename
DeviceContext
,
typename
T
,
bool
is_test
,
typename
Enable
=
void
>
class
SoftmaxFunctor
{
public:
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
X
,
...
...
paddle/fluid/operators/math/softmax_impl.h
浏览文件 @
b5c44fd4
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
...
...
@@ -32,8 +33,8 @@ struct ValueClip {
}
};
template
<
typename
DeviceContext
,
typename
T
,
bool
is_test
>
void
SoftmaxFunctor
<
DeviceContext
,
T
,
is_test
>::
operator
()(
template
<
typename
DeviceContext
,
typename
T
,
bool
is_test
,
typename
Enable
>
void
SoftmaxFunctor
<
DeviceContext
,
T
,
is_test
,
Enable
>::
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
auto
logits
=
EigenMatrix
<
T
>::
From
(
*
X
);
...
...
@@ -65,36 +66,46 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
.
broadcast
(
one_by_class
));
}
template
<
typename
DeviceContext
,
typename
T
>
class
SoftmaxFunctor
<
DeviceContext
,
T
,
true
>
{
template
<
class
DeviceContext
>
using
enable_if_CPU
=
typename
std
::
enable_if
<
std
::
is_same
<
DeviceContext
,
platform
::
CPUDeviceContext
>::
value
>::
type
;
template
<
typename
DeviceContext
>
class
SoftmaxFunctor
<
DeviceContext
,
float
,
true
,
enable_if_CPU
<
DeviceContext
>>
{
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
auto
logits
=
EigenMatrix
<
T
>::
From
(
*
X
);
auto
softmax
=
EigenMatrix
<
T
>::
From
(
*
Y
);
auto
in_dims
=
X
->
dims
();
auto
out_dims
=
Y
->
dims
();
const
float
*
in_data
=
X
->
data
<
float
>
();
float
*
out_data
=
Y
->
data
<
float
>
();
const
int
kBatchDim
=
0
;
const
int
kClassDim
=
1
;
// 2D data. Batch x C
const
int
batch_size
=
in_dims
[
kBatchDim
];
const
int
num_classes
=
in_dims
[
kClassDim
];
std
::
vector
<
float
>
entities
(
batch_size
);
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
float
>
(
context
);
for
(
int
n
=
0
;
n
<
batch_size
;
++
n
)
{
entities
[
n
]
=
in_data
[
n
*
num_classes
];
for
(
int
c
=
1
;
c
<
num_classes
;
++
c
)
{
entities
[
n
]
=
in_data
[
n
*
num_classes
+
c
]
>
entities
[
n
]
?
in_data
[
n
*
num_classes
+
c
]
:
entities
[
n
];
}
for
(
int
c
=
0
;
c
<
num_classes
;
++
c
)
{
out_data
[
n
*
num_classes
+
c
]
=
in_data
[
n
*
num_classes
+
c
]
-
entities
[
n
];
}
}
const
int
batch_size
=
logits
.
dimension
(
kBatchDim
);
const
int
num_classes
=
logits
.
dimension
(
kClassDim
);
Eigen
::
DSizes
<
int
,
1
>
along_class
(
kClassDim
);
Eigen
::
DSizes
<
int
,
2
>
batch_by_one
(
batch_size
,
1
);
Eigen
::
DSizes
<
int
,
2
>
one_by_class
(
1
,
num_classes
);
auto
shifted_logits
=
(
logits
-
logits
.
maximum
(
along_class
)
.
eval
()
.
reshape
(
batch_by_one
)
.
broadcast
(
one_by_class
));
softmax
.
device
(
*
context
.
eigen_device
())
=
shifted_logits
.
exp
();
softmax
.
device
(
*
context
.
eigen_device
())
=
(
softmax
*
softmax
.
sum
(
along_class
)
.
inverse
()
.
eval
()
.
reshape
(
batch_by_one
)
.
broadcast
(
one_by_class
));
blas
.
VEXP
(
num_classes
*
batch_size
,
out_data
,
out_data
);
for
(
int
n
=
0
;
n
<
batch_size
;
++
n
)
{
entities
[
n
]
=
out_data
[
n
*
num_classes
];
for
(
int
c
=
1
;
c
<
num_classes
;
++
c
)
{
entities
[
n
]
+=
out_data
[
n
*
num_classes
+
c
];
}
blas
.
SCAL
(
num_classes
,
1.0
f
/
entities
[
n
],
&
out_data
[
n
*
num_classes
]);
}
}
};
...
...
paddle/fluid/operators/reader/create_py_reader_op.cc
浏览文件 @
b5c44fd4
paddle/fluid/operators/roi_align_op.cc
浏览文件 @
b5c44fd4
...
...
@@ -35,10 +35,10 @@ class ROIAlignOp : public framework::OperatorWithKernel {
"The format of input tensor is NCHW."
);
PADDLE_ENFORCE
(
rois_dims
.
size
()
==
2
,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2],
…
]."
);
"given as [[x1, y1, x2, y2],
...
]."
);
PADDLE_ENFORCE
(
rois_dims
[
1
]
==
4
,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2],
…
]."
);
"given as [[x1, y1, x2, y2],
...
]."
);
int
pooled_height
=
ctx
->
Attrs
().
Get
<
int
>
(
"pooled_height"
);
int
pooled_width
=
ctx
->
Attrs
().
Get
<
int
>
(
"pooled_width"
);
float
spatial_scale
=
ctx
->
Attrs
().
Get
<
float
>
(
"spatial_scale"
);
...
...
@@ -103,7 +103,7 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
"(LoDTensor), "
"ROIs (Regions of Interest) to pool over. "
"should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2],
…
]. "
"given as [[x1, y1, x2, y2],
...
]. "
"(x1, y1) is the top left coordinates, and "
"(x2, y2) is the bottom right coordinates."
);
AddOutput
(
"Out"
,
...
...
paddle/fluid/operators/roi_pool_op.cc
浏览文件 @
b5c44fd4
...
...
@@ -40,10 +40,10 @@ class ROIPoolOp : public framework::OperatorWithKernel {
"The format of input tensor is NCHW."
);
PADDLE_ENFORCE
(
rois_dims
.
size
()
==
2
,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2],
…
]."
);
"given as [[x1, y1, x2, y2],
...
]."
);
PADDLE_ENFORCE
(
rois_dims
[
1
]
==
kROISize
,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2],
…
]."
);
"given as [[x1, y1, x2, y2],
...
]."
);
int
pooled_height
=
ctx
->
Attrs
().
Get
<
int
>
(
"pooled_height"
);
int
pooled_width
=
ctx
->
Attrs
().
Get
<
int
>
(
"pooled_width"
);
...
...
@@ -110,7 +110,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(LoDTensor), "
"ROIs (Regions of Interest) to pool over. "
"should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2],
…
]. "
"given as [[x1, y1, x2, y2],
...
]. "
"Where batch_id is the id of the data, "
"(x1, y1) is the top left coordinates, and "
"(x2, y2) is the bottom right coordinates."
);
...
...
paddle/fluid/operators/softmax_op.h
浏览文件 @
b5c44fd4
...
...
@@ -35,8 +35,10 @@ class SoftmaxKernel : public framework::OpKernel<T> {
Tensor
X_2d
=
framework
::
ReshapeToMatrix
(
*
X
,
rank
-
1
);
Tensor
Out_2d
=
framework
::
ReshapeToMatrix
(
*
Out
,
rank
-
1
);
#ifdef ON_INFER
math
::
SoftmaxFunctor
<
DeviceContext
,
T
,
true
>
()(
#ifdef PADDLE_ON_INFERENCE
math
::
SoftmaxFunctor
<
DeviceContext
,
T
,
std
::
is_same
<
DeviceContext
,
platform
::
CPUDeviceContext
>::
value
>
()(
context
.
template
device_context
<
DeviceContext
>(),
&
X_2d
,
&
Out_2d
);
#else
math
::
SoftmaxFunctor
<
DeviceContext
,
T
,
false
>
()(
...
...
paddle/fluid/operators/space_to_depth_op.cc
浏览文件 @
b5c44fd4
paddle/fluid/operators/stack_op.h
浏览文件 @
b5c44fd4
...
...
@@ -147,20 +147,32 @@ class StackKernel : public framework::OpKernel<T> {
auto
&
dim
=
x
[
0
]
->
dims
();
for
(
auto
i
=
0
;
i
<
axis
;
++
i
)
pre
*=
dim
[
i
];
for
(
auto
i
=
axis
;
i
<
dim
.
size
();
++
i
)
post
*=
dim
[
i
];
int
total_num
=
pre
*
n
*
post
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
#ifdef __NVCC__
int
total_num
=
pre
*
n
*
post
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
thrust
::
device_vector
<
const
T
*>
device_x_vec
(
x_datas
);
auto
x_data_arr
=
device_x_vec
.
data
().
get
();
#else
auto
x_data_arr
=
x_datas
.
data
();
#endif
StackFunctorForRange
(
dev_ctx
,
x_data_arr
,
y_data
,
total_num
,
n
,
post
);
#ifdef __NVCC__
// Wait() must be called because device_x_vec may be destructed before
// kernel ends
dev_ctx
.
Wait
();
#else
auto
x_data_arr
=
x_datas
.
data
();
size_t
x_offset
=
0
;
size_t
y_offset
=
0
;
for
(
int
i
=
0
;
i
<
pre
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
;
j
++
)
{
std
::
memcpy
(
y_data
+
y_offset
,
x_data_arr
[
j
]
+
x_offset
,
post
*
sizeof
(
T
));
y_offset
+=
post
;
}
x_offset
+=
post
;
}
#endif
}
};
...
...
paddle/fluid/platform/CMakeLists.txt
浏览文件 @
b5c44fd4
if
(
NOT WIN32
)
proto_library
(
profiler_proto SRCS profiler.proto DEPS framework_proto
)
py_proto_compile
(
profiler_py_proto SRCS profiler.proto
)
...
...
@@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _
add_dependencies
(
profiler_py_proto profiler_py_proto_init
)
if
(
NOT WIN32
)
add_custom_command
(
TARGET profiler_py_proto POST_BUILD
COMMAND
${
CMAKE_COMMAND
}
-E make_directory
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/proto/profiler
COMMAND cp *.py
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/proto/profiler
COMMENT
"Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
)
else
(
NOT WIN32
)
string
(
REPLACE
"/"
"
\\
"
proto_dstpath
"
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/proto/profiler/"
)
add_custom_command
(
TARGET profiler_py_proto POST_BUILD
COMMAND
${
CMAKE_COMMAND
}
-E make_directory
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/proto/profiler
COMMAND copy /Y *.py
${
proto_dstpath
}
COMMENT
"Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
)
endif
(
NOT WIN32
)
if
(
WITH_GPU
)
...
...
@@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test
(
cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda
)
nv_test
(
transform_test SRCS transform_test.cu DEPS memory place device_context
)
if
(
NOT WIN32
)
cc_library
(
device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto
${
GPU_CTX_DEPS
}
)
cc_library
(
profiler SRCS profiler.cc DEPS device_context device_tracer
)
cc_test
(
profiler_test SRCS profiler_test.cc DEPS profiler
)
endif
(
NOT WIN32
)
nv_test
(
float16_gpu_test SRCS float16_test.cu DEPS lod_tensor
)
cc_test
(
float16_test SRCS float16_test.cc DEPS lod_tensor
)
...
...
paddle/fluid/platform/cpu_helper.cc
浏览文件 @
b5c44fd4
...
...
@@ -29,12 +29,19 @@ namespace platform {
void
SetNumThreads
(
int
num_threads
)
{
#ifdef PADDLE_USE_OPENBLAS
// windows has no support for openblas multi-thread
// please refer to: https://github.com/PaddlePaddle/Paddle/issues/7234
#ifdef _WIN32
if
(
num_threads
>
1
)
{
num_threads
=
1
;
}
#endif
int
real_num_threads
=
num_threads
>
1
?
num_threads
:
1
;
openblas_set_num_threads
(
real_num_threads
);
#elif defined(PADDLE_WITH_MKLML)
int
real_num_threads
=
num_threads
>
1
?
num_threads
:
1
;
platform
::
dynload
::
MKL_Set_Num_Threads
(
real_num_threads
);
omp_set_num_threads
(
num_threads
);
omp_set_num_threads
(
real_
num_threads
);
#else
PADDLE_ENFORCE
(
false
,
"To be implemented."
);
#endif
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
b5c44fd4
...
...
@@ -143,6 +143,39 @@ class CudnnWorkspaceHandle {
std
::
unique_ptr
<
std
::
lock_guard
<
std
::
mutex
>>
guard_
;
};
#if CUDA_VERSION >= 9000
// Scope guard that switches a cuBLAS handle to `new_math_mode` for the
// lifetime of the object and restores the previous mode on destruction.
//
// If the handle is already in the requested mode, nothing is changed and
// nothing is restored.
class ScopedCublasMathMode {
 public:
  ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode)
      : handle_(handle), need_reset(false) {
    PADDLE_ENFORCE(
        platform::dynload::cublasGetMathMode(handle_, &old_math_mode_),
        "Failed to get old cublas math mode");
    if (old_math_mode_ != new_math_mode) {
      PADDLE_ENFORCE(
          platform::dynload::cublasSetMathMode(handle_, new_math_mode),
          "Failed to set new cublas math mode");
      need_reset = true;
    }
  }

  // NOTE(review): PADDLE_ENFORCE may throw; a throw from this destructor
  // would terminate the program under the default noexcept destructor rules
  // -- confirm this is acceptable.
  ~ScopedCublasMathMode() {
    if (need_reset) {
      PADDLE_ENFORCE(
          platform::dynload::cublasSetMathMode(handle_, old_math_mode_),
          "Failed to set old cublas math mode");
    }
  }

 private:
  cublasHandle_t handle_;
  cublasMath_t old_math_mode_;  // mode read from the handle at construction
  bool need_reset;              // true only if the constructor changed the mode
};
#endif
class
CUDADeviceContext
:
public
DeviceContext
{
public:
explicit
CUDADeviceContext
(
CUDAPlace
place
);
...
...
@@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext {
callback_manager_
->
Wait
();
}
#if CUDA_VERSION >= 9000
/*! \brief Run `callback` with the cuBLAS handle set to math mode `new_math`.
 *  The cuBLAS handle may be shared by several threads, so the mode change and
 *  the call are performed while holding cublas_mtx_. */
template <typename Callback>
void CublasCall(Callback callback, cublasMath_t new_math) {
  std::lock_guard<std::mutex> lock(cublas_mtx_);
  // Restores the previous math mode when this scope ends.
  ScopedCublasMathMode math_mode_scope(cublas_handle_, new_math);
  callback();
}
#endif
private:
CUDAPlace
place_
;
...
...
@@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext {
// If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
mutable
std
::
mutex
callback_mtx_
;
std
::
unique_ptr
<
StreamCallbackManager
>
callback_manager_
;
mutable
std
::
mutex
cublas_mtx_
;
};
template
<
>
...
...
paddle/fluid/platform/device_tracer.h
浏览文件 @
b5c44fd4
...
...
@@ -13,17 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if !defined(_WIN32)
#include <sys/time.h>
#else
#include <windows.h>
#endif // !_WIN32
#include <time.h>
#include <chrono> // NOLINT
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h"
namespace
paddle
{
...
...
@@ -32,15 +26,11 @@ namespace platform {
///////////////////////
// WARN: Under Development. Don't depend on it yet.
//////////////////////
#if !defined(_WIN32)
// Returns the current gettimeofday() time converted to nanoseconds.
// The result has microsecond resolution (it is always a multiple of 1000).
inline uint64_t PosixInNsec() {
  struct timeval now;
  gettimeofday(&now, nullptr);
  const uint64_t micros =
      static_cast<uint64_t>(now.tv_sec) * 1000000 + now.tv_usec;
  return 1000 * micros;
}
#else
inline
uint64_t
PosixInNsec
()
{
return
static_cast
<
uint64_t
>
(
0
);
}
#endif // !_WIN32
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
...
...
paddle/fluid/platform/dynload/cublas.cc
浏览文件 @
b5c44fd4
...
...
@@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
CUBLAS_BLAS_ROUTINE_EACH_R3
(
DEFINE_WRAP
);
#endif
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
CUBLAS_BLAS_ROUTINE_EACH_R4
(
DEFINE_WRAP
);
#endif
}
// namespace dynload
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/dynload/cublas.h
浏览文件 @
b5c44fd4
...
...
@@ -61,9 +61,6 @@ extern void *cublas_dso_handle;
extern DynLoad__##__name __name
#endif
#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
...
...
@@ -106,11 +103,22 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode);
#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3
(
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
)
#endif
// APIs available after CUDA 9.1
#if CUDA_VERSION >= 9010
#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4
(
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
}
// namespace dynload
}
// namespace platform
...
...
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
b5c44fd4
...
...
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include <glog/logging.h>
#include <cudnn.h>
...
...
paddle/fluid/platform/enforce.h
浏览文件 @
b5c44fd4
...
...
@@ -18,12 +18,6 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__
#if defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#endif
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
...
...
@@ -127,14 +121,14 @@ struct EOFException : public std::exception {
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// there is no equivalent intrinsics in msvc.
#define UNLIKELY(condition) (condition
== 0
)
#define UNLIKELY(condition) (condition)
#endif
#if !defined(_WIN32)
#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
#else
// there is no equivalent intrinsics in msvc.
#define LIKELY(condition) (condition
!= 0
)
#define LIKELY(condition) (condition)
#endif
template
<
typename
...
Args
>
...
...
@@ -248,7 +242,6 @@ inline void throw_on_error(T e) {
throw_on_error
(
e
,
""
);
}
#if !defined(_WIN32)
#define PADDLE_THROW(...) \
do { \
throw ::paddle::platform::EnforceNotMet( \
...
...
@@ -272,17 +265,6 @@ inline void throw_on_error(T e) {
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif // REPLACE_ENFORCE_GLOG
#else // !_WIN32
// disable enforce, caused by the variadic macro exception error
#define PADDLE_THROW(x) \
do { \
throw std::make_exception_ptr( \
std::runtime_error("Windows disable the enforce.")); \
} while (false)
#define PADDLE_ENFORCE(x, ...) x
#endif // !_WIN32
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
...
...
@@ -302,20 +284,6 @@ inline void throw_on_error(T e) {
* extra messages is also supported, for example:
* PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
*/
#if !defined(_WIN32)
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
do { \
if (UNLIKELY(nullptr == (__VAL))) { \
...
...
@@ -335,27 +303,19 @@ inline void throw_on_error(T e) {
paddle::string::Sprintf("" __VA_ARGS__)); \
} \
} while (0)
#else
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1))
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1))
#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1))
#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1))
#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1))
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1))
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
do { \
if (!((__VAL0)__CMP(__VAL1))) { \
PADDLE_THROW("Windows disable the enforce. Enforce failed."); \
} \
} while (0)
#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \
do { \
if (nullptr == (__VAL1)) { \
PADDLE_THROW("Windows disable the enforce. Enforce failed"); \
} \
} while (0)
#endif // !_WIN32
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/gpu_info.cc
浏览文件 @
b5c44fd4
...
...
@@ -26,6 +26,16 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
"additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk."
);
DEFINE_bool
(
enable_cublas_tensor_op_math
,
false
,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
"but it may loss precision. Currently, There are two CUDA libraries that"
" use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
" GEMM computations(the matrices must be either half precision or single "
"precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
"input and output must be half precision) and recurrent neural networks "
"(RNNs)."
);
namespace
paddle
{
namespace
platform
{
...
...
@@ -64,6 +74,16 @@ int GetCUDADriverVersion(int id) {
return
driver_version
;
}
// Reports whether the current device can use Tensor Cores.
// Always false when built against CUDA older than 9.0; otherwise true when
// the current device's compute capability value is at least 70.
bool TensorCoreAvailable() {
#if CUDA_VERSION >= 9000
  // NOTE(review): presumably the capability is encoded as major * 10 + minor,
  // so 70 corresponds to 7.0 -- confirm against GetCUDAComputeCapability.
  const int compute_capability = GetCUDAComputeCapability(GetCurrentDeviceId());
  return compute_capability >= 70;
#else
  return false;
#endif
}
int
GetCUDAMultiProcessors
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetCUDADeviceCount
(),
"id must less than GPU count"
);
int
count
;
...
...
paddle/fluid/platform/gpu_info.h
浏览文件 @
b5c44fd4
...
...
@@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id);
//! Get the driver version of the ith GPU
int
GetCUDADriverVersion
(
int
id
);
//! Whether the current device supports TensorCore
bool
TensorCoreAvailable
();
//! Get the MultiProcessors of the ith GPU.
int
GetCUDAMultiProcessors
(
int
i
);
...
...
paddle/fluid/platform/init.cc
浏览文件 @
b5c44fd4
...
...
@@ -38,6 +38,7 @@ std::once_flag p2p_init_flag;
void
InitGflags
(
std
::
vector
<
std
::
string
>
argv
)
{
std
::
call_once
(
gflags_init_flag
,
[
&
]()
{
FLAGS_logtostderr
=
true
;
argv
.
insert
(
argv
.
begin
(),
"dummy"
);
int
argc
=
argv
.
size
();
char
**
arr
=
new
char
*
[
argv
.
size
()];
...
...
@@ -116,13 +117,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
places
.
emplace_back
(
platform
::
CPUPlace
());
platform
::
DeviceContextPool
::
Init
(
places
);
// windows has no support for openblas multi-thread
#ifdef _WIN32
if
(
FLAGS_paddle_num_threads
>
1
)
{
FLAGS_paddle_num_threads
=
1
;
}
#endif
#ifndef PADDLE_WITH_MKLDNN
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
#endif
...
...
paddle/fluid/platform/init.h
浏览文件 @
b5c44fd4
...
...
@@ -16,9 +16,6 @@ limitations under the License. */
#include <string>
#include <vector>
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "gflags/gflags.h"
#include "glog/logging.h"
...
...
paddle/fluid/platform/port.h
浏览文件 @
b5c44fd4
...
...
@@ -17,6 +17,7 @@
#include <cstdio>
#include <stdexcept>
#include <time.h>
#include <memory>
#include <string>
...
...
@@ -27,8 +28,13 @@
#include <dlfcn.h> // dladdr
#include <execinfo.h> // backtrace
#include <sys/stat.h>
#include <sys/time.h>
#include <algorithm> // std::accumulate
#else
#define NOMINMAX // msvc max/min macro conflict with std::min/max
// solve static linking error in windows
// https://github.com/google/glog/issues/301
#define GOOGLE_GLOG_DLL_DECL
#include <io.h> // _popen, _pclose
#include <stdio.h>
#include <windows.h>
...
...
@@ -57,6 +63,25 @@ static void *dlopen(const char *filename, int flag) {
return
reinterpret_cast
<
void
*>
(
hModule
);
}
static
int
gettimeofday
(
struct
timeval
*
tp
,
void
*
tzp
)
{
time_t
clock
;
struct
tm
tm
;
SYSTEMTIME
wtm
;
GetLocalTime
(
&
wtm
);
tm
.
tm_year
=
wtm
.
wYear
-
1900
;
tm
.
tm_mon
=
wtm
.
wMonth
-
1
;
tm
.
tm_mday
=
wtm
.
wDay
;
tm
.
tm_hour
=
wtm
.
wHour
;
tm
.
tm_min
=
wtm
.
wMinute
;
tm
.
tm_sec
=
wtm
.
wSecond
;
tm
.
tm_isdst
=
-
1
;
clock
=
mktime
(
&
tm
);
tp
->
tv_sec
=
clock
;
tp
->
tv_usec
=
wtm
.
wMilliseconds
*
1000
;
return
(
0
);
}
#endif // !_WIN32
static
void
ExecShellCommand
(
const
std
::
string
&
cmd
,
std
::
string
*
message
)
{
...
...
@@ -132,11 +157,13 @@ static void MkDir(const char *path) {
}
}
#else
CreateDirectory
(
path
,
NULL
);
BOOL
return_value
=
CreateDirectory
(
path
,
NULL
);
if
(
!
return_value
)
{
auto
errorno
=
GetLastError
();
if
(
errorno
!=
ERROR_ALREADY_EXISTS
)
{
throw
std
::
runtime_error
(
path_error
);
}
}
#endif // !_WIN32
}
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
b5c44fd4
...
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/port.h"
#include <sys/time.h>
#include <algorithm>
#include <iomanip>
#include <limits>
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
b5c44fd4
...
...
@@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void
PopEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
#if !defined(_WIN32)
struct
RecordEvent
{
// dev_ctx can be set to nullptr if device is cpu.
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
...
...
@@ -106,15 +105,6 @@ struct RecordBlock {
std
::
string
name_
;
uint64_t
start_ns_
;
};
#else
// windows do not support profiler temporarily.
struct
RecordEvent
{
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
)
{}
};
struct
RecordBlock
{
explicit
RecordBlock
(
int
block_id
)
{}
};
#endif
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
...
...
paddle/fluid/platform/stream_callback_manager.h
浏览文件 @
b5c44fd4
...
...
@@ -45,16 +45,15 @@ class StreamCallbackManager {
inline
void
AddCallback
(
Callback
&&
callback
)
const
{
auto
*
stream_callback_context
=
new
StreamCallbackContext
(
this
,
std
::
forward
<
Callback
>
(
callback
));
PADDLE_ENFORCE
(
#if CUDA_VERSION >= 10000
cudaLaunchHostFunc
(
stream_
,
StreamCallbackManager
::
StreamCallbackFunc
,
stream_callback_context
)
#else
cudaStreamAddCallback
(
stream_
,
PADDLE_ENFORCE
(
cudaLaunchHostFunc
(
stream_
,
StreamCallbackManager
::
StreamCallbackFunc
,
stream_callback_context
,
0
)
stream_callback_context
));
// NOLINT
#else
PADDLE_ENFORCE
(
cudaStreamAddCallback
(
stream_
,
StreamCallbackManager
::
StreamCallbackFunc
,
stream_callback_context
,
0
));
// NOLINT
#endif
);
// NOLINT
}
void
Wait
()
const
{
thread_pool_
.
reset
(
new
ThreadPool
(
1
));
}
...
...
paddle/fluid/pybind/CMakeLists.txt
浏览文件 @
b5c44fd4
set
(
PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder
)
set
(
PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc
)
if
(
NOT WIN32
)
list
(
APPEND PYBIND_DEPS parallel_executor profiler
)
list
(
APPEND PYBIND_SRCS recordio.cc
)
endif
(
NOT WIN32
)
set
(
PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler
)
set
(
PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
)
if
(
WITH_PYTHON
)
if
(
WITH_AMD_GPU
)
hip_library
(
paddle_pybind SHARED
SRCS
${
PYBIND_SRCS
}
DEPS
${
PYBIND_DEPS
}
${
GLOB_OP_LIB
}
${
GLOB_OPERATOR_DEPS
}
)
DEPS
ARCHIVE_START
${
PYBIND_DEPS
}
${
GLOB_OP_LIB
}
${
GLOB_OPERATOR_DEPS
}
ARCHIVE_END
)
else
()
cc_library
(
paddle_pybind SHARED
SRCS
${
PYBIND_SRCS
}
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
b5c44fd4
...
...
@@ -21,13 +21,6 @@ limitations under the License. */
#include <utility>
#include <vector>
#if defined(_WIN32)
#define NOMINMAX
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#include <Windows.h>
#endif
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/framework.pb.h"
...
...
@@ -36,9 +29,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#ifndef _WIN32
#include "paddle/fluid/framework/parallel_executor.h"
#endif
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h"
...
...
@@ -46,6 +37,7 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
...
...
@@ -95,6 +87,9 @@ bool IsCompiledWithDIST() {
}
PYBIND11_PLUGIN
(
core
)
{
// Not used, just make sure cpu_info.cc is linked.
paddle
::
platform
::
CpuTotalPhysicalMemory
();
paddle
::
memory
::
allocation
::
UseAllocatorStrategyGFlag
();
py
::
module
m
(
"core"
,
"C++ core of PaddlePaddle"
);
...
...
@@ -359,19 +354,16 @@ All parameter, weight, gradient are variables in Paddle.
return
self
.
GetMutable
<
platform
::
Communicator
>
();
},
py
::
return_value_policy
::
reference
)
#endif
.
def
(
"get_reader"
,
[](
Variable
&
self
)
->
framework
::
ReaderHolder
*
{
PADDLE_ENFORCE
(
self
.
IsType
<
framework
::
ReaderHolder
>
());
return
self
.
GetMutable
<
framework
::
ReaderHolder
>
();
},
py
::
return_value_policy
::
reference
)
#endif
;
py
::
return_value_policy
::
reference
);
#if !defined(_WIN32)
py
::
class_
<
framework
::
ReaderHolder
>
(
m
,
"Reader"
,
""
)
.
def
(
"reset"
,
&
framework
::
ReaderHolder
::
ResetAll
);
#endif
using
LoDTensorBlockingQueue
=
::
paddle
::
operators
::
reader
::
LoDTensorBlockingQueue
;
...
...
@@ -640,7 +632,6 @@ All parameter, weight, gradient are variables in Paddle.
#endif
#endif
#ifndef _WIN32
py
::
enum_
<
platform
::
ProfilerState
>
(
m
,
"ProfilerState"
,
py
::
arithmetic
())
.
value
(
"kDisabled"
,
platform
::
ProfilerState
::
kDisabled
)
.
value
(
"kCPU"
,
platform
::
ProfilerState
::
kCPU
)
...
...
@@ -661,7 +652,6 @@ All parameter, weight, gradient are variables in Paddle.
m
.
def
(
"disable_profiler"
,
platform
::
DisableProfiler
);
m
.
def
(
"is_profiler_enabled"
,
platform
::
IsProfileEnabled
);
m
.
def
(
"reset_profiler"
,
platform
::
ResetProfiler
);
#endif
py
::
class_
<
ir
::
Pass
,
std
::
shared_ptr
<
ir
::
Pass
>>
pass
(
m
,
"Pass"
);
pass
.
def
(
py
::
init
())
...
...
@@ -690,7 +680,6 @@ All parameter, weight, gradient are variables in Paddle.
.
def
(
"remove_pass"
,
[](
ir
::
PassBuilder
&
self
,
size_t
idx
)
{
self
.
RemovePass
(
idx
);
});
#ifndef _WIN32
// -- python binds for parallel executor.
py
::
class_
<
ParallelExecutor
>
pe
(
m
,
"ParallelExecutor"
);
py
::
class_
<
ExecutionStrategy
>
exec_strategy
(
pe
,
"ExecutionStrategy"
,
R"DOC(
...
...
@@ -918,7 +907,6 @@ All parameter, weight, gradient are variables in Paddle.
});
BindRecordIOWriter
(
&
m
);
#endif
return
m
.
ptr
();
}
}
// namespace pybind
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
b5c44fd4
...
...
@@ -94,6 +94,30 @@ function cmake_gen() {
else
exit
1
fi
elif
[
"
$1
"
==
"cp36-cp36m"
]
;
then
if
[
-d
"/Library/Frameworks/Python.framework/Versions/3.6"
]
;
then
export
LD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.6/lib/
export
DYLD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.6/lib/
export
PATH
=
/Library/Frameworks/Python.framework/Versions/3.6/bin/:
${
PATH
}
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
WITH_FLUID_ONLY
=
${
WITH_FLUID_ONLY
:-
ON
}
else
exit
1
fi
elif
[
"
$1
"
==
"cp37-cp37m"
]
;
then
if
[
-d
"/Library/Frameworks/Python.framework/Versions/3.7"
]
;
then
export
LD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.7/lib/
export
DYLD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.7/lib/
export
PATH
=
/Library/Frameworks/Python.framework/Versions/3.7/bin/:
${
PATH
}
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
WITH_FLUID_ONLY
=
${
WITH_FLUID_ONLY
:-
ON
}
else
exit
1
fi
fi
else
if
[
"
$1
"
!=
""
]
;
then
...
...
@@ -116,6 +140,18 @@ function cmake_gen() {
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
elif
[
"
$1
"
==
"cp36-cp36m"
]
;
then
export
LD_LIBRARY_PATH
=
/opt/_internal/cpython-3.6.0/lib/:
${
LD_LIBRARY_PATH
}
export
PATH
=
/opt/_internal/cpython-3.6.0/bin/:
${
PATH
}
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
elif
[
"
$1
"
==
"cp37-cp37m"
]
;
then
export
LD_LIBRARY_PATH
=
/opt/_internal/cpython-3.7.0/lib/:
${
LD_LIBRARY_PATH
}
export
PATH
=
/opt/_internal/cpython-3.7.0/bin/:
${
PATH
}
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
fi
fi
fi
...
...
@@ -419,7 +455,7 @@ function assert_api_not_changed() {
source
.env/bin/activate
pip
install
${
PADDLE_ROOT
}
/build/python/dist/
*
whl
python
${
PADDLE_ROOT
}
/tools/print_signatures.py paddle.fluid
>
new.spec
if
[
"
$1
"
==
"cp35-cp35m"
]
;
then
if
[
"
$1
"
==
"cp35-cp35m"
]
||
[
"
$1
"
==
"cp36-cp36m"
]
||
[
"
$1
"
==
"cp37-cp37m"
]
;
then
# Use sed to keep the python2 and python3 specs the same
sed
-i
's/arg0: str/arg0: unicode/g'
new.spec
sed
-i
"s/
\(
.*Transpiler.*
\)
.__init__ ArgSpec(args=
\[
'self'].*/
\1
.__init__ /g"
new.spec
...
...
paddle/testing/paddle_gtest_main.cc
浏览文件 @
b5c44fd4
...
...
@@ -28,7 +28,7 @@ int main(int argc, char** argv) {
for
(
int
i
=
0
;
i
<
argc
;
++
i
)
{
new_argv
.
push_back
(
argv
[
i
]);
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
new_argv
.
push_back
(
strdup
(
"--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"
));
#else
...
...
python/paddle/fluid/__init__.py
浏览文件 @
b5c44fd4
...
...
@@ -115,9 +115,8 @@ def __bootstrap__():
'use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'eager_delete_scope'
,
'use_mkldnn'
,
'use_ngraph'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'cpu_deterministic'
,
'eager_delete_tensor_gb'
,
'allocator_strategy'
,
'reader_queue_speed_test_mode'
,
'print_sub_graph_dir'
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'allocator_strategy'
,
'reader_queue_speed_test_mode'
,
'print_sub_graph_dir'
]
if
os
.
name
!=
'nt'
:
read_env_flags
.
append
(
'warpctc_dir'
)
...
...
@@ -130,11 +129,13 @@ def __bootstrap__():
read_env_flags
.
append
(
'rpc_send_thread_num'
)
read_env_flags
.
append
(
'rpc_get_thread_num'
)
read_env_flags
.
append
(
'rpc_prefetch_thread_num'
)
read_env_flags
.
append
(
'rpc_disable_reuse_port'
)
if
core
.
is_compiled_with_cuda
():
read_env_flags
+=
[
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
'enable_cublas_tensor_op_math'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
]
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
[
"--tryfromenv="
+
","
.
join
(
read_env_flags
)])
...
...
python/paddle/fluid/contrib/inferencer.py
浏览文件 @
b5c44fd4
...
...
@@ -15,15 +15,13 @@
from
__future__
import
print_function
import
contextlib
import
os
from
..
import
core
from
..
import
executor
from
..
import
framework
from
..
import
io
if
os
.
name
!=
'nt'
:
from
..
import
parallel_executor
from
..
import
parallel_executor
from
..
import
unique_name
from
.trainer
import
check_and_get_place
...
...
python/paddle/fluid/contrib/trainer.py
浏览文件 @
b5c44fd4
...
...
@@ -28,8 +28,7 @@ from .. import framework
from
..
import
io
# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
from
..
import
optimizer
as
opt_module
if
os
.
name
!=
'nt'
:
from
..
import
parallel_executor
from
..
import
parallel_executor
from
..transpiler
import
distribute_transpiler
__all__
=
[
...
...
python/paddle/fluid/contrib/utils/__init__.py
0 → 100644
浏览文件 @
b5c44fd4
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
.
import
hdfs_utils
from
.hdfs_utils
import
*
# Re-export exactly the public names declared by hdfs_utils.
__all__ = hdfs_utils.__all__
python/paddle/fluid/contrib/utils/hdfs_utils.py
0 → 100644
浏览文件 @
b5c44fd4
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HDFS Utils"""
import
os
import
subprocess
import
multiprocessing
from
datetime
import
datetime
import
re
import
copy
import
errno
import
logging
# Public API of this module. multi_upload is exported together with
# multi_download so that star-imports expose both bulk-transfer helpers.
__all__ = ["HDFSClient", "multi_download", "multi_upload"]

# Module-level logger used by every helper below; messages are emitted at
# INFO level and above with a timestamp and the level name.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
_logger = logging.getLogger("hdfs_utils")
_logger.setLevel(logging.INFO)
class HDFSClient(object):
    """Thin wrapper around the ``hadoop fs`` command-line client.

    Every operation shells out to ``<hadoop_home>/bin/hadoop fs`` followed by
    one ``-D<key>=<value>`` option per entry of ``configs`` and then the
    operation-specific arguments.
    """

    def __init__(self, hadoop_home, configs):
        """
        Args:
            hadoop_home: directory that contains ``bin/hadoop``.
            configs: dict of Hadoop settings; each entry is passed on the
                command line as ``-D<key>=<value>``.
        """
        self.pre_commands = []
        hadoop_bin = '%s/bin/hadoop' % hadoop_home
        self.pre_commands.append(hadoop_bin)
        dfs = 'fs'
        self.pre_commands.append(dfs)

        # dict.items() exists on Python 2 and 3; iteritems() is Python 2 only.
        for k, v in configs.items():
            config_command = '-D%s=%s' % (k, v)
            self.pre_commands.append(config_command)

    @staticmethod
    def _to_text(data):
        """Return subprocess output as ``str`` (decodes bytes on Python 3)."""
        if data is None or isinstance(data, str):
            return data
        return data.decode('utf-8', 'replace')

    def __run_hdfs_cmd(self, commands, retry_times=5):
        """Run ``hadoop fs <configs> <commands>``, retrying on failure.

        Args:
            commands: list of arguments appended to the common prefix.
            retry_times: number of extra attempts after the first failure.

        Returns:
            tuple ``(return_code, stdout, stderr)`` of the last attempt.
        """
        whole_commands = copy.deepcopy(self.pre_commands)
        whole_commands.extend(commands)

        print('Running system command: {0}'.format(' '.join(whole_commands)))

        ret_code = 0
        ret_out = None
        ret_err = None
        for x in range(retry_times + 1):
            proc = subprocess.Popen(
                whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            (output, errors) = proc.communicate()
            # Callers split the output with str separators, so normalise the
            # bytes returned on Python 3 to text.
            output = self._to_text(output)
            errors = self._to_text(errors)
            ret_code, ret_out, ret_err = proc.returncode, output, errors
            if ret_code:
                _logger.warning(
                    'Times: %d, Error running command: %s. Return code: %d, Error: %s'
                    % (x, ' '.join(whole_commands), proc.returncode, errors))
            else:
                break
        return ret_code, ret_out, ret_err

    def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
        """Upload one local file into an HDFS directory.

        Args:
            hdfs_path: destination directory on HDFS (created when missing).
            local_path: existing local file; directories are not supported.
            overwrite: when a file with the same base name already exists in
                ``hdfs_path``, delete it first instead of giving up.
            retry_times: extra attempts for the ``-put`` command.

        Returns:
            True on success, False otherwise.
        """
        assert hdfs_path is not None
        assert local_path is not None and os.path.exists(local_path)

        if os.path.isdir(local_path):
            _logger.warning(
                "The Local path: {} is dir and I will support it later, return".
                format(local_path))
            return False

        base = os.path.basename(local_path)
        if not self.is_exist(hdfs_path):
            self.makedirs(hdfs_path)
        else:
            hdfs_file = os.path.join(hdfs_path, base)
            if self.is_exist(hdfs_file):
                if overwrite:
                    # Remove only the clashing file. Deleting hdfs_path itself
                    # would wipe every other file in the target directory.
                    _logger.warning(
                        "The HDFS path: {} is exist and overwrite is True, delete it".
                        format(hdfs_file))
                    self.delete(hdfs_file)
                else:
                    _logger.error(
                        "The HDFS path: {} is exist and overwrite is False, return".
                        format(hdfs_file))
                    return False

        put_commands = ["-put", local_path, hdfs_path]
        returncode, output, errors = self.__run_hdfs_cmd(put_commands,
                                                         retry_times)
        if returncode:
            _logger.error("Put local path: {} to HDFS path: {} failed".format(
                local_path, hdfs_path))
            return False
        else:
            _logger.info("Put local path: {} to HDFS path: {} successfully".
                         format(local_path, hdfs_path))
            return True

    def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
        """Download one HDFS file into a local directory.

        Args:
            hdfs_path: file on HDFS; directories are not supported.
            local_path: local destination directory (created when missing).
            overwrite: remove an existing local file of the same base name
                before downloading instead of giving up.
            unzip: accepted for interface compatibility; currently unused.

        Returns:
            True on success, False otherwise.
        """
        _logger.info('Downloading %r to %r.', hdfs_path, local_path)

        if not self.is_exist(hdfs_path):
            print("HDFS path: {} do not exist".format(hdfs_path))
            return False
        if self.is_dir(hdfs_path):
            _logger.error(
                "The HDFS path: {} is dir and I will support it later, return".
                format(hdfs_path))
            return False

        if os.path.exists(local_path):
            base = os.path.basename(hdfs_path)
            local_file = os.path.join(local_path, base)
            if os.path.exists(local_file):
                if overwrite:
                    os.remove(local_file)
                else:
                    _logger.error(
                        "The Local path: {} is exist and overwrite is False, return".
                        format(local_file))
                    return False

        self.make_local_dirs(local_path)

        download_commands = ["-get", hdfs_path, local_path]
        returncode, output, errors = self.__run_hdfs_cmd(download_commands)
        if returncode:
            _logger.error("Get local path: {} from HDFS path: {} failed".format(
                local_path, hdfs_path))
            return False
        else:
            _logger.info("Get local path: {} from HDFS path: {} successfully".
                         format(local_path, hdfs_path))
            return True

    def is_exist(self, hdfs_path=None):
        """Check whether ``hdfs_path`` exists on HDFS (``-test -e``).

        Returns:
            True when the command exits with status 0, False otherwise.
        """
        exist_cmd = ['-test', '-e', hdfs_path]
        returncode, output, errors = self.__run_hdfs_cmd(
            exist_cmd, retry_times=1)

        if returncode:
            _logger.error("HDFS is_exist HDFS path: {} failed".format(
                hdfs_path))
            return False
        else:
            _logger.info("HDFS is_exist HDFS path: {} successfully".format(
                hdfs_path))
            return True

    def is_dir(self, hdfs_path=None):
        """Check whether ``hdfs_path`` exists and is a directory (``-test -d``).

        Returns:
            True for an existing directory, False otherwise.
        """
        if not self.is_exist(hdfs_path):
            return False

        dir_cmd = ['-test', '-d', hdfs_path]
        returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)

        if returncode:
            _logger.error("HDFS path: {} failed is not a directory".format(
                hdfs_path))
            return False
        else:
            _logger.info("HDFS path: {} successfully is a directory".format(
                hdfs_path))
            return True

    def delete(self, hdfs_path):
        """Remove a file (``-rm``) or a directory tree (``-rmr``) from HDFS.

        Args:
            hdfs_path: HDFS path to remove.

        Returns:
            True when the path did not exist or was removed, False when the
            remove command failed.
        """
        _logger.info('Deleting %r.', hdfs_path)

        if not self.is_exist(hdfs_path):
            _logger.warning("HDFS path: {} do not exist".format(hdfs_path))
            return True

        if self.is_dir(hdfs_path):
            del_cmd = ['-rmr', hdfs_path]
        else:
            del_cmd = ['-rm', hdfs_path]

        returncode, output, errors = self.__run_hdfs_cmd(del_cmd, retry_times=0)

        if returncode:
            _logger.error("HDFS path: {} delete files failure".format(
                hdfs_path))
            return False
        else:
            _logger.info("HDFS path: {} delete files successfully".format(
                hdfs_path))
            return True

    def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
        """Move a file or folder on HDFS (``-mv``).

        Args:
            hdfs_src_path: source path; must exist.
            hdfs_dst_path: destination path.
            overwrite: only affects logging: an existing destination is
                reported as an error when False, but the move is still
                attempted and its outcome is left to ``hadoop fs -mv``.

        Returns:
            True when the move succeeded, False otherwise.
        """
        assert hdfs_src_path is not None
        assert hdfs_dst_path is not None

        if not self.is_exist(hdfs_src_path):
            _logger.info("HDFS path do not exist: {}".format(hdfs_src_path))
            # Nothing to move; the -mv below could only fail.
            return False
        if self.is_exist(hdfs_dst_path) and not overwrite:
            _logger.error("HDFS path is exist: {} and overwrite=False".format(
                hdfs_dst_path))

        rename_command = ['-mv', hdfs_src_path, hdfs_dst_path]
        returncode, output, errors = self.__run_hdfs_cmd(
            rename_command, retry_times=1)

        if returncode:
            _logger.error("HDFS rename path: {} to {} failed".format(
                hdfs_src_path, hdfs_dst_path))
            return False
        else:
            _logger.info("HDFS rename path: {} to {} successfully".format(
                hdfs_src_path, hdfs_dst_path))
            return True

    @staticmethod
    def make_local_dirs(local_path):
        """Create ``local_path`` (with parents); an existing path is not an error."""
        try:
            os.makedirs(local_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    def makedirs(self, hdfs_path):
        """Create a directory on HDFS (``-mkdir``).

        Args:
            hdfs_path: remote directory to create.

        Returns:
            True when the directory already exists or was created, False when
            the command failed.
        """
        _logger.info('Creating directories to %r.', hdfs_path)
        assert hdfs_path is not None

        if self.is_exist(hdfs_path):
            return True

        mkdirs_commands = ['-mkdir', hdfs_path]
        returncode, output, errors = self.__run_hdfs_cmd(
            mkdirs_commands, retry_times=1)

        if returncode:
            _logger.error("HDFS mkdir path: {} failed".format(hdfs_path))
            return False
        else:
            _logger.info("HDFS mkdir path: {} successfully".format(hdfs_path))
            return True

    def ls(self, hdfs_path):
        """List the entries directly under ``hdfs_path`` (``-ls``).

        Returns:
            list of paths taken from the 8th column of every 8-column output
            line; an empty list when the path is missing or listing fails.
        """
        assert hdfs_path is not None

        if not self.is_exist(hdfs_path):
            return []

        ls_commands = ['-ls', hdfs_path]
        returncode, output, errors = self.__run_hdfs_cmd(
            ls_commands, retry_times=1)

        if returncode:
            _logger.error("HDFS list path: {} failed".format(hdfs_path))
            return []
        else:
            _logger.info("HDFS list path: {} successfully".format(hdfs_path))

            ret_lines = []
            regex = re.compile(r'\s+')
            out_lines = output.strip().split("\n")
            for line in out_lines:
                re_line = regex.split(line)
                if len(re_line) == 8:
                    ret_lines.append(re_line[7])
            return ret_lines

    def lsr(self, hdfs_path, only_file=True, sort=True):
        """Recursively list ``hdfs_path`` (``-lsr``).

        Args:
            hdfs_path: HDFS path to walk.
            only_file: skip entries whose permission string starts with "d".
            sort: order the result by modification time, oldest first
                (entries with the same timestamp keep their listing order).

        Returns:
            list of paths; an empty list when the path is missing or listing
            fails.
        """
        assert hdfs_path is not None

        if not self.is_exist(hdfs_path):
            return []

        ls_commands = ['-lsr', hdfs_path]
        returncode, output, errors = self.__run_hdfs_cmd(
            ls_commands, retry_times=1)

        if returncode:
            _logger.error("HDFS list all files: {} failed".format(hdfs_path))
            return []
        else:
            _logger.info("HDFS list all files: {} successfully".format(
                hdfs_path))
            lines = []
            regex = re.compile(r'\s+')
            out_lines = output.strip().split("\n")
            for line in out_lines:
                re_line = regex.split(line)
                if len(re_line) == 8:
                    if only_file and re_line[0].startswith("d"):
                        continue
                    else:
                        # (path, "YYYY-MM-DD HH:MM")
                        lines.append(
                            (re_line[7], re_line[5] + " " + re_line[6]))
            if sort:
                # Sort in place with a key function: sorted() returns a new
                # list (the result used to be dropped) and ``cmp=`` is not
                # available on Python 3.
                lines.sort(key=lambda item: datetime.strptime(
                    item[1], '%Y-%m-%d %H:%M'))
            ret_lines = [ret[0] for ret in lines]
            return ret_lines
def multi_upload(client,
                 hdfs_path,
                 local_path,
                 multi_processes=5,
                 overwrite=False):
    """Upload every file under a local directory to HDFS in parallel.

    The files found by walking ``local_path`` are dealt round-robin to
    ``multi_processes`` worker processes; each file is uploaded to the
    sub-directory of ``hdfs_path`` matching its location below ``local_path``.

    :param client: instance of HDFSClient
    :param hdfs_path: destination root on HDFS
    :param local_path: local directory to upload
    :param multi_processes: number of worker processes, default=5
    :param overwrite: forwarded to ``client.upload``
    :return: None
    """

    def _collect_files(root):
        # Anything that is not a directory yields no files at all.
        if not os.path.isdir(root):
            return []
        return [
            os.path.join(parent, fname)
            for parent, _, fnames in os.walk(root) for fname in fnames
        ]

    def _upload_batch(batch):
        for local_file in batch:
            sub_dir = os.path.relpath(os.path.dirname(local_file), local_path)
            remote_dir = os.path.join(hdfs_path, sub_dir)
            client.upload(remote_dir, local_file, overwrite, retry_times=5)

    assert isinstance(client, HDFSClient)

    all_files = _collect_files(local_path)
    if not all_files:
        _logger.info("there are nothing need to upload, exit")
        return

    _logger.info("Start {} multi process to upload datas".format(
        multi_processes))

    workers = []
    for idx in range(multi_processes):
        # Worker idx takes every multi_processes-th file starting at idx.
        worker = multiprocessing.Process(
            target=_upload_batch, args=(all_files[idx::multi_processes], ))
        workers.append(worker)
        worker.start()

    # Wait for all workers to finish.
    for worker in workers:
        worker.join()

    _logger.info("Finish {} multi process to upload datas".format(
        multi_processes))
def multi_download(client,
                   hdfs_path,
                   local_path,
                   trainer_id,
                   trainers,
                   multi_processes=5):
    """Download this trainer's share of the files under an HDFS path.

    The recursive listing of ``hdfs_path`` is sliced as
    ``files[trainer_id::trainers]``; that share is then dealt round-robin to
    ``multi_processes`` worker processes, each calling ``client.download``.

    :param client: instance of HDFSClient
    :param hdfs_path: source root on HDFS
    :param local_path: local destination root (created when missing)
    :param trainer_id: index of the current trainer
    :param trainers: total number of trainers
    :param multi_processes: number of worker processes, default=5
    :return: list of local paths the selected files are expected at
    """

    def _local_dir_for(remote_file):
        # Mirror the file's directory below hdfs_path under local_path.
        sub_dir = os.path.relpath(os.path.dirname(remote_file), hdfs_path)
        return sub_dir

    def _download_batch(batch):
        for remote_file in batch:
            target_dir = os.path.join(local_path, _local_dir_for(remote_file))
            client.download(remote_file, target_dir)

    assert isinstance(client, HDFSClient)

    client.make_local_dirs(local_path)
    _logger.info("Make local dir {} successfully".format(local_path))

    every_file = client.lsr(hdfs_path, sort=True)
    my_files = every_file[trainer_id::trainers]
    _logger.info("Get {} files From all {} files need to be download from {}".
                 format(len(my_files), len(every_file), hdfs_path))

    _logger.info("Start {} multi process to download datas".format(
        multi_processes))

    workers = []
    for idx in range(multi_processes):
        worker = multiprocessing.Process(
            target=_download_batch, args=(my_files[idx::multi_processes], ))
        workers.append(worker)
        worker.start()

    # Wait for all workers to finish.
    for worker in workers:
        worker.join()

    _logger.info("Finish {} multi process to download datas".format(
        multi_processes))

    return [
        os.path.join(local_path,
                     _local_dir_for(remote_file),
                     os.path.basename(remote_file))
        for remote_file in my_files
    ]
# Ad-hoc usage demo, executed only when the file is run as a script.
# The host name, credentials and paths below are placeholders; running this
# requires a reachable Hadoop installation at hadoop_home.
if __name__ == "__main__":
    hadoop_home = "/home/client/hadoop-client/hadoop/"

    configs = {
        "fs.default.name": "hdfs://xxx.hadoop.com:54310",
        "hadoop.job.ugi": "hello,hello123"
    }

    client = HDFSClient(hadoop_home, configs)

    # Non-recursive and recursive listings.
    client.ls("/user/com/train-25")
    files = client.lsr("/user/com/train-25/models")

    # Download the share of trainer 1 out of 5 trainers with 5 processes.
    downloads = multi_download(
        client,
        "/user/com/train-25/model",
        "/home/xx/data1",
        1,
        5,
        multi_processes=5)

    # Upload the local directory back to HDFS.
    multi_upload(client, "/user/com/train-25/model", "/home/xx/data1")
python/paddle/fluid/layers/detection.py
浏览文件 @
b5c44fd4
...
...
@@ -1029,6 +1029,7 @@ def density_prior_box(input,
clip
=
False
,
steps
=
[
0.0
,
0.0
],
offset
=
0.5
,
flatten_to_2d
=
False
,
name
=
None
):
"""
**Density Prior Box Operator**
...
...
@@ -1065,22 +1066,24 @@ def density_prior_box(input,
height/weight of the input will be automatically calculated.
Default: [0., 0.]
offset(float): Prior boxes center offset. Default: 0.5
flatten_to_2d(bool): Whether to flatten output prior boxes and variance
to 2D shape, the second dim is 4. Default: False.
name(str): Name of the density prior box op. Default: None.
Returns:
tuple: A tuple with two Variable (boxes, variances)
boxes: the output density prior boxes of PriorBox.
The layout is [H, W, num_priors, 4].
The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
H is the height of input, W is the width of input,
num_priors is the total
box count of each position of input.
num_priors is the total box count of each position of input.
variances: the expanded variances of PriorBox.
The layout is [H, W, num_priors, 4].
The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
H is the height of input, W is the width of input
num_priors is the total
box count of each position of input
num_priors is the total box count of each position of input.
Examples:
...
...
@@ -1089,14 +1092,11 @@ def density_prior_box(input,
box, var = fluid.layers.density_prior_box(
input=conv1,
image=images,
min_sizes=[100.],
max_sizes=[200.],
aspect_ratios=[1.0, 1.0 / 2.0, 2.0],
densities=[3, 4],
fixed_sizes=[50., 60.],
fixed_ratios=[1.0, 3.0, 1.0 / 3.0],
flip=True,
clip=True)
densities=[4, 2, 1],
fixed_sizes=[32.0, 64.0, 128.0],
fixed_ratios=[1.],
clip=True,
flatten_to_2d=True)
"""
helper
=
LayerHelper
(
"density_prior_box"
,
**
locals
())
dtype
=
helper
.
input_dtype
()
...
...
@@ -1127,14 +1127,11 @@ def density_prior_box(input,
'step_w'
:
steps
[
0
],
'step_h'
:
steps
[
1
],
'offset'
:
offset
,
'densities'
:
densities
,
'fixed_sizes'
:
fixed_sizes
,
'fixed_ratios'
:
fixed_ratios
,
'flatten_to_2d'
:
flatten_to_2d
,
}
if
densities
is
not
None
and
len
(
densities
)
>
0
:
attrs
[
'densities'
]
=
densities
if
fixed_sizes
is
not
None
and
len
(
fixed_sizes
)
>
0
:
attrs
[
'fixed_sizes'
]
=
fixed_sizes
if
fixed_ratios
is
not
None
and
len
(
fixed_ratios
)
>
0
:
attrs
[
'fixed_ratios'
]
=
fixed_ratios
box
=
helper
.
create_variable_for_type_inference
(
dtype
)
var
=
helper
.
create_variable_for_type_inference
(
dtype
)
helper
.
append_op
(
...
...
python/paddle/fluid/layers/io.py
浏览文件 @
b5c44fd4
...
...
@@ -347,10 +347,8 @@ def _copy_reader_create_op_(block, op):
return
new_op
if
os
.
name
!=
'nt'
:
@
templatedoc
(
op_type
=
'create_recordio_file_reader'
)
def
open_recordio_file
(
filename
,
@
templatedoc
(
op_type
=
'create_recordio_file_reader'
)
def
open_recordio_file
(
filename
,
shapes
,
lod_levels
,
dtypes
,
...
...
@@ -406,8 +404,8 @@ if os.name != 'nt':
startup_var
.
desc
.
set_dtypes
(
dtypes
)
startup_var
.
persistable
=
True
main_prog_var
=
_copy_reader_var_
(
default_main_program
().
current_block
(),
startup_var
)
main_prog_var
=
_copy_reader_var_
(
default_main_program
().
current_block
(),
startup_var
)
if
pass_num
>
1
:
main_prog_var
=
multi_pass
(
reader
=
main_prog_var
,
pass_num
=
pass_num
)
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
b5c44fd4
...
...
@@ -85,6 +85,7 @@ __all__ = [
'row_conv'
,
'multiplex'
,
'layer_norm'
,
'group_norm'
,
'softmax_with_cross_entropy'
,
'smooth_l1'
,
'one_hot'
,
...
...
@@ -343,10 +344,8 @@ def embedding(input,
return
tmp
if
os
.
name
!=
'nt'
:
@
templatedoc
(
op_type
=
"lstm"
)
def
dynamic_lstm
(
input
,
@
templatedoc
(
op_type
=
"lstm"
)
def
dynamic_lstm
(
input
,
size
,
h_0
=
None
,
c_0
=
None
,
...
...
@@ -963,10 +962,8 @@ def linear_chain_crf(input, label, param_attr=None):
return
log_likelihood
if
os
.
name
!=
'nt'
:
@
templatedoc
()
def
crf_decoding
(
input
,
param_attr
,
label
=
None
):
@
templatedoc
()
def
crf_decoding
(
input
,
param_attr
,
label
=
None
):
"""
${comment}
...
...
@@ -992,11 +989,9 @@ if os.name != 'nt':
dtype
=
helper
.
input_dtype
())
helper
.
append_op
(
type
=
'crf_decoding'
,
inputs
=
{
"Emission"
:
[
input
],
inputs
=
{
"Emission"
:
[
input
],
"Transition"
:
transition
,
"Label"
:
label
},
"Label"
:
label
},
outputs
=
{
"ViterbiPath"
:
[
viterbi_path
]})
return
viterbi_path
...
...
@@ -2139,11 +2134,16 @@ def pool2d(input,
input tensor is NCHW, where N is batch size, C is
the number of channels, H is the height of the
feature, and W is the width of the feature.
pool_size (int): The side length of pooling windows. All pooling
windows are squares with pool_size on a side.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be a square of an int.
pool_type: ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size.
pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int.
pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
Otherwise, the pool padding size will be a square of an int.
global_pooling (bool): ${global_pooling_comment}
use_cudnn (bool): ${use_cudnn_comment}
ceil_mode (bool): ${ceil_mode_comment}
...
...
@@ -2553,6 +2553,84 @@ def layer_norm(input,
return
helper
.
append_activation
(
layer_norm_out
)
@
templatedoc
()
def
group_norm
(
input
,
groups
,
epsilon
=
1e-05
,
param_attr
=
None
,
bias_attr
=
None
,
act
=
None
,
data_layout
=
'NCHW'
,
name
=
None
):
"""
**Group Normalization Layer**
Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
Args:
input(Variable): The input tensor variable.
groups(int): The number of groups that the channels are divided into.
epsilon(float): The small value added to the variance to prevent
division by zero.
param_attr(ParamAttr|None): The parameter attribute for the learnable
scale :math:`g`. If it is set to False, no scale will be added to the output units.
If it is set to None, the scale is initialized to one. Default: None.
bias_attr(ParamAttr|None): The parameter attribute for the learnable
bias :math:`b`. If it is set to False, no bias will be added to the output units.
If it is set to None, the bias is initialized to zero. Default: None.
act(str): Activation to be applied to the output of group normalization.
data_layout(string|NCHW): Only NCHW is supported.
name (str): The name of this layer. It is optional.
Returns:
Variable: A tensor variable which is the result after applying group normalization on the input.
Examples:
>>> data = fluid.layers.data(name='data', shape=[8, 32, 32],
>>> dtype='float32')
>>> x = fluid.layers.group_norm(input=data, groups=4)
"""
helper
=
LayerHelper
(
'group_norm'
,
**
locals
())
dtype
=
helper
.
input_dtype
()
# create input and parameters
inputs
=
{
'X'
:
input
}
input_shape
=
input
.
shape
if
data_layout
!=
'NCHW'
:
raise
ValueError
(
"unsupported data layout:"
+
data_layout
)
param_shape
=
[
input_shape
[
1
]]
if
param_attr
:
scale
=
helper
.
create_parameter
(
attr
=
helper
.
param_attr
,
shape
=
param_shape
,
dtype
=
dtype
,
default_initializer
=
Constant
(
1.0
))
inputs
[
'Scale'
]
=
scale
if
bias_attr
:
bias
=
helper
.
create_parameter
(
attr
=
helper
.
bias_attr
,
shape
=
param_shape
,
dtype
=
dtype
,
is_bias
=
True
)
inputs
[
'Bias'
]
=
bias
# create output
mean_out
=
helper
.
create_tmp_variable
(
dtype
=
dtype
,
stop_gradient
=
True
)
variance_out
=
helper
.
create_tmp_variable
(
dtype
=
dtype
,
stop_gradient
=
True
)
group_norm_out
=
helper
.
create_tmp_variable
(
dtype
)
helper
.
append_op
(
type
=
"group_norm"
,
inputs
=
inputs
,
outputs
=
{
"Y"
:
group_norm_out
,
"Mean"
:
mean_out
,
"Variance"
:
variance_out
,
},
attrs
=
{
"epsilon"
:
epsilon
,
"groups"
:
groups
})
return
helper
.
append_activation
(
group_norm_out
)
def
conv2d_transpose
(
input
,
num_filters
,
output_size
=
None
,
...
...
@@ -5593,14 +5671,8 @@ def label_smooth(label,
return
smooth_label
if
os
.
name
!=
'nt'
:
@
templatedoc
()
def
roi_pool
(
input
,
rois
,
pooled_height
=
1
,
pooled_width
=
1
,
spatial_scale
=
1.0
):
@
templatedoc
()
def
roi_pool
(
input
,
rois
,
pooled_height
=
1
,
pooled_width
=
1
,
spatial_scale
=
1.0
):
"""
${comment}
...
...
@@ -5788,7 +5860,7 @@ def image_resize(input,
Examples:
.. code-block:: python
out = fluid.layers.image_resize(input, out_shape=[12, 12])
out = fluid.layers.image_resize(input, out_shape=[12, 12]
, resample="NEAREST"
)
"""
resample_methods
=
{
'BILINEAR'
:
'bilinear'
,
...
...
@@ -5891,6 +5963,11 @@ def resize_bilinear(input,
Returns:
${out_comment}.
Examples:
.. code-block:: python
out = fluid.layers.resize_bilinear(input, out_shape=[12, 12])
"""
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'BILINEAR'
,
actual_shape
)
...
...
@@ -5937,6 +6014,11 @@ def resize_nearest(input,
Returns:
${out_comment}.
Examples:
.. code-block:: python
out = fluid.layers.resize_nearest(input, out_shape=[12, 12])
"""
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'NEAREST'
,
actual_shape
)
...
...
@@ -7692,6 +7774,15 @@ def logical_and(x, y, out=None, name=None):
Returns:
out(${out_type}): ${out_comment}
Examples:
.. code-block:: python
left = fluid.layers.data(
name='left', shape=[1], dtype='int32')
right = fluid.layers.data(
name='right', shape=[1], dtype='int32')
result = fluid.layers.logical_and(x=left, y=right)
"""
return
_logical_op
(
...
...
@@ -7711,6 +7802,15 @@ def logical_or(x, y, out=None, name=None):
Returns:
out(${out_type}): ${out_comment}
Examples:
.. code-block:: python
left = fluid.layers.data(
name='left', shape=[1], dtype='int32')
right = fluid.layers.data(
name='right', shape=[1], dtype='int32')
result = fluid.layers.logical_or(x=left, y=right)
"""
return
_logical_op
(
...
...
@@ -7730,6 +7830,15 @@ def logical_xor(x, y, out=None, name=None):
Returns:
out(${out_type}): ${out_comment}
Examples:
.. code-block:: python
left = fluid.layers.data(
name='left', shape=[1], dtype='int32')
right = fluid.layers.data(
name='right', shape=[1], dtype='int32')
result = fluid.layers.logical_xor(x=left, y=right)
"""
return
_logical_op
(
...
...
@@ -7748,6 +7857,13 @@ def logical_not(x, out=None, name=None):
Returns:
out(${out_type}): ${out_comment}
Examples:
.. code-block:: python
left = fluid.layers.data(
name='left', shape=[1], dtype='int32')
result = fluid.layers.logical_not(x=left)
"""
return
_logical_op
(
...
...
@@ -7767,6 +7883,13 @@ def clip(x, min, max, name=None):
Returns:
out(${out_type}): ${out_comment}
Examples:
.. code-block:: python
input = fluid.layers.data(
name='data', shape=[1], dtype='float32')
reward = fluid.layers.clip(x=input, min=-1.0, max=1.0)
"""
helper
=
LayerHelper
(
"clip"
,
**
locals
())
...
...
@@ -7799,6 +7922,13 @@ def clip_by_norm(x, max_norm, name=None):
Returns:
out(${out_type}): ${out_comment}
Examples:
.. code-block:: python
input = fluid.layers.data(
name='data', shape=[1], dtype='float32')
reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0)
"""
helper
=
LayerHelper
(
"clip_by_norm"
,
**
locals
())
...
...
python/paddle/fluid/layers/ops.py
浏览文件 @
b5c44fd4
...
...
@@ -100,12 +100,12 @@ Examples:
>>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
"""
if
os
.
name
!=
'nt'
:
__all__
+=
[
'cumsum'
]
__all__
+=
[
'cumsum'
]
_cum_sum_
=
generate_layer_fn
(
'cumsum'
)
_cum_sum_
=
generate_layer_fn
(
'cumsum'
)
def
cumsum
(
x
,
axis
=
None
,
exclusive
=
None
,
reverse
=
None
):
def
cumsum
(
x
,
axis
=
None
,
exclusive
=
None
,
reverse
=
None
):
locals_var
=
locals
().
keys
()
kwargs
=
dict
()
for
name
in
locals_var
:
...
...
@@ -114,12 +114,13 @@ if os.name != 'nt':
kwargs
[
name
]
=
val
return
_cum_sum_
(
**
kwargs
)
cumsum
.
__doc__
=
_cum_sum_
.
__doc__
+
"""
Examples:
cumsum
.
__doc__
=
_cum_sum_
.
__doc__
+
"""
Examples:
>>> data = fluid.layers.data(name="input", shape=[32, 784])
>>> result = fluid.layers.cumsum(data, axis=0)
"""
"""
__all__
+=
[
'thresholded_relu'
]
...
...
python/paddle/fluid/nets.py
浏览文件 @
b5c44fd4
...
...
@@ -250,7 +250,8 @@ def sequence_conv_pool(input,
filter_size
,
param_attr
=
None
,
act
=
"sigmoid"
,
pool_type
=
"max"
):
pool_type
=
"max"
,
bias_attr
=
None
):
"""
The sequence_conv_pool is composed of Sequence Convolution and Pooling.
...
...
@@ -266,6 +267,11 @@ def sequence_conv_pool(input,
pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
Default :math:`max`.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, sequence_conv
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
Return:
Variable: The final result after Sequence Convolution and Pooling.
...
...
@@ -289,6 +295,7 @@ def sequence_conv_pool(input,
num_filters
=
num_filters
,
filter_size
=
filter_size
,
param_attr
=
param_attr
,
bias_attr
=
bias_attr
,
act
=
act
)
pool_out
=
layers
.
sequence_pool
(
input
=
conv_out
,
pool_type
=
pool_type
)
...
...
python/paddle/fluid/tests/book/test_image_classification.py
浏览文件 @
b5c44fd4
...
...
@@ -239,7 +239,7 @@ def infer(use_cuda, save_dirname=None):
assert
len
(
results
[
0
])
==
len
(
transpiler_results
[
0
])
for
i
in
range
(
len
(
results
[
0
])):
np
.
testing
.
assert_almost_equal
(
results
[
0
][
i
],
transpiler_results
[
0
][
i
],
decimal
=
5
)
results
[
0
][
i
],
transpiler_results
[
0
][
i
],
decimal
=
4
)
print
(
"infer results: "
,
results
[
0
])
...
...
python/paddle/fluid/tests/test_detection.py
浏览文件 @
b5c44fd4
...
...
@@ -112,6 +112,8 @@ class TestDetection(unittest.TestCase):
class
TestPriorBox
(
unittest
.
TestCase
):
def
test_prior_box
(
self
):
program
=
Program
()
with
program_guard
(
program
):
data_shape
=
[
3
,
224
,
224
]
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
data_shape
,
dtype
=
'float32'
)
...
...
@@ -130,6 +132,8 @@ class TestPriorBox(unittest.TestCase):
class
TestDensityPriorBox
(
unittest
.
TestCase
):
def
test_density_prior_box
(
self
):
program
=
Program
()
with
program_guard
(
program
):
data_shape
=
[
3
,
224
,
224
]
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
data_shape
,
dtype
=
'float32'
)
...
...
@@ -143,7 +147,7 @@ class TestDensityPriorBox(unittest.TestCase):
clip
=
True
)
assert
len
(
box
.
shape
)
==
4
assert
box
.
shape
==
var
.
shape
assert
box
.
shape
[
3
]
==
4
assert
box
.
shape
[
-
1
]
==
4
class
TestAnchorGenerator
(
unittest
.
TestCase
):
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
b5c44fd4
...
...
@@ -23,6 +23,12 @@ if(NOT WITH_DISTRIBUTE)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_text_classification
)
endif
(
NOT WITH_DISTRIBUTE
)
if
(
NOT
${
WITH_GPU
}
)
LIST
(
REMOVE_ITEM TEST_OPS test_conv2d_fusion_op
)
elseif
(
${
CUDNN_MAJOR_VERSION
}
VERSION_LESS 7
)
LIST
(
REMOVE_ITEM TEST_OPS test_conv2d_fusion_op
)
endif
()
list
(
REMOVE_ITEM TEST_OPS test_seq_concat_op
)
# FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
list
(
REMOVE_ITEM TEST_OPS test_modified_huber_loss_op
)
# FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
list
(
REMOVE_ITEM TEST_OPS test_lstm_unit_op
)
# # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
...
...
@@ -45,6 +51,10 @@ if(APPLE)
list
(
REMOVE_ITEM TEST_OPS test_dist_se_resnext
)
list
(
REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass
)
endif
()
if
(
NOT WITH_MKLML
)
# this op is not supported on openblas
list
(
REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op
)
endif
()
function
(
py_test_modules TARGET_NAME
)
if
(
WITH_TESTING
)
...
...
@@ -71,10 +81,12 @@ list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
list
(
REMOVE_ITEM TEST_OPS test_dist_transformer
)
list
(
REMOVE_ITEM TEST_OPS test_parallel_executor_transformer
)
list
(
REMOVE_ITEM TEST_OPS test_image_classification_resnet
)
list
(
REMOVE_ITEM TEST_OPS test_interpolate_op
)
foreach
(
TEST_OP
${
TEST_OPS
}
)
py_test_modules
(
${
TEST_OP
}
MODULES
${
TEST_OP
}
)
endforeach
(
TEST_OP
)
py_test_modules
(
test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=
${
WARPCTC_LIB_DIR
}
SERIAL
)
py_test_modules
(
test_interpolate_op MODULES test_interpolate_op SERIAL
)
if
(
WITH_DISTRIBUTE
)
py_test_modules
(
test_dist_train MODULES test_dist_train SERIAL
)
set_tests_properties
(
test_listen_and_serv_op PROPERTIES TIMEOUT 20
)
...
...
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
b5c44fd4
...
...
@@ -362,7 +362,9 @@ class OpTest(unittest.TestCase):
else
:
return
[]
places
=
[
fluid
.
CPUPlace
()]
if
core
.
is_compiled_with_cuda
()
and
core
.
op_support_gpu
(
self
.
op_type
):
cpu_only
=
self
.
_cpu_only
if
hasattr
(
self
,
'_cpu_only'
)
else
False
if
core
.
is_compiled_with_cuda
()
and
core
.
op_support_gpu
(
self
.
op_type
)
\
and
not
cpu_only
:
places
.
append
(
core
.
CUDAPlace
(
0
))
return
places
...
...
@@ -379,7 +381,7 @@ class OpTest(unittest.TestCase):
outs
.
sort
(
key
=
len
)
checker
(
outs
)
def
_
_
assert_is_close
(
self
,
numeric_grads
,
analytic_grads
,
names
,
def
_assert_is_close
(
self
,
numeric_grads
,
analytic_grads
,
names
,
max_relative_error
,
msg_prefix
):
for
a
,
b
,
name
in
six
.
moves
.
zip
(
numeric_grads
,
analytic_grads
,
names
):
...
...
@@ -449,7 +451,7 @@ class OpTest(unittest.TestCase):
analytic_grads
=
self
.
_get_gradient
(
inputs_to_check
,
place
,
output_names
,
no_grad_set
)
self
.
_
_
assert_is_close
(
numeric_grads
,
analytic_grads
,
inputs_to_check
,
self
.
_assert_is_close
(
numeric_grads
,
analytic_grads
,
inputs_to_check
,
max_relative_error
,
"Gradient Check On %s"
%
str
(
place
))
...
...
python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
浏览文件 @
b5c44fd4
...
...
@@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest):
'offset'
:
self
.
offset
,
'densities'
:
self
.
densities
,
'fixed_sizes'
:
self
.
fixed_sizes
,
'fixed_ratios'
:
self
.
fixed_ratios
'fixed_ratios'
:
self
.
fixed_ratios
,
'flatten_to_2d'
:
self
.
flatten_to_2d
}
self
.
outputs
=
{
'Boxes'
:
self
.
out_boxes
,
'Variances'
:
self
.
out_var
}
...
...
@@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest):
self
.
set_data
()
def
set_density
(
self
):
self
.
densities
=
[]
self
.
fixed_sizes
=
[]
self
.
fixed_ratios
=
[]
self
.
densities
=
[
4
,
2
,
1
]
self
.
fixed_sizes
=
[
32.0
,
64.0
,
128.0
]
self
.
fixed_ratios
=
[
1.0
]
self
.
layer_w
=
17
self
.
layer_h
=
17
self
.
image_w
=
533
self
.
image_h
=
533
self
.
flatten_to_2d
=
False
def
init_test_params
(
self
):
self
.
layer_w
=
32
self
.
layer_h
=
32
self
.
image_w
=
40
self
.
image_h
=
40
self
.
set_density
()
self
.
step_w
=
float
(
self
.
image_w
)
/
float
(
self
.
layer_w
)
self
.
step_h
=
float
(
self
.
image_h
)
/
float
(
self
.
layer_h
)
...
...
@@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest):
self
.
variances
=
[
0.1
,
0.1
,
0.2
,
0.2
]
self
.
variances
=
np
.
array
(
self
.
variances
,
dtype
=
np
.
float
).
flatten
()
self
.
set_density
()
self
.
clip
=
True
self
.
num_priors
=
0
if
len
(
self
.
fixed_sizes
)
>
0
and
len
(
self
.
densities
)
>
0
:
...
...
@@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest):
(
self
.
layer_h
,
self
.
layer_w
,
self
.
num_priors
,
1
))
self
.
out_boxes
=
out_boxes
.
astype
(
'float32'
)
self
.
out_var
=
out_var
.
astype
(
'float32'
)
if
self
.
flatten_to_2d
:
self
.
out_boxes
=
self
.
out_boxes
.
reshape
((
-
1
,
4
))
self
.
out_var
=
self
.
out_var
.
reshape
((
-
1
,
4
))
class
TestDensityPriorBox
(
TestDensityPriorBoxOp
):
...
...
@@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp):
self
.
densities
=
[
3
,
4
]
self
.
fixed_sizes
=
[
1.0
,
2.0
]
self
.
fixed_ratios
=
[
1.0
]
self
.
layer_w
=
32
self
.
layer_h
=
32
self
.
image_w
=
40
self
.
image_h
=
40
self
.
flatten_to_2d
=
True
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
0 → 100644
浏览文件 @
b5c44fd4
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
from
test_elementwise_mul_op
import
*
class
TestElementwiseMulMKLDNNOp_BroadcastNCHW16c
(
ElementwiseMulOp
):
def
init_input_output
(
self
):
x
=
np
.
random
.
rand
(
1
,
16
,
2
,
2
).
astype
(
self
.
dtype
)
self
.
x
=
x
.
transpose
(
0
,
2
,
3
,
1
).
reshape
(
1
,
16
,
2
,
2
)
self
.
y
=
np
.
random
.
rand
(
1
,
16
).
astype
(
self
.
dtype
)
self
.
out
=
x
*
self
.
y
.
reshape
(
1
,
16
,
1
,
1
)
self
.
out
=
self
.
out
.
transpose
(
0
,
2
,
3
,
1
).
reshape
(
1
,
16
,
2
,
2
)
def
setUp
(
self
):
super
(
TestElementwiseMulMKLDNNOp_BroadcastNCHW16c
,
self
).
setUp
()
self
.
attrs
[
"x_data_format"
]
=
"nchw16c"
self
.
attrs
[
"y_data_format"
]
=
"nc"
self
.
_cpu_only
=
True
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
def
init_axis
(
self
):
self
.
axis
=
0
def
test_check_grad_normal
(
self
):
pass
def
test_check_grad_ingore_x
(
self
):
pass
def
test_check_grad_ingore_y
(
self
):
pass
@
unittest
.
skip
(
"Not implemented yet."
)
# TODO(mgallus): enable when implemented.
class
TestElementwiseMulMKLDNNOp_BroadcastNCHW8c
(
ElementwiseMulOp
):
def
init_input_output
(
self
):
x
=
np
.
random
.
rand
(
1
,
8
,
2
,
2
).
astype
(
self
.
dtype
)
self
.
x
=
x
.
transpose
(
0
,
2
,
3
,
1
).
reshape
(
1
,
8
,
2
,
2
)
self
.
y
=
np
.
random
.
rand
(
1
,
8
).
astype
(
self
.
dtype
)
self
.
out
=
x
*
self
.
y
.
reshape
(
1
,
8
,
1
,
1
)
self
.
out
=
self
.
out
.
transpose
(
0
,
2
,
3
,
1
).
reshape
(
1
,
8
,
2
,
2
)
def
setUp
(
self
):
super
(
TestElementwiseMulMKLDNNOp_BroadcastNCHW8c
,
self
).
setUp
()
self
.
attrs
[
"x_data_format"
]
=
"nchw8c"
self
.
attrs
[
"y_data_format"
]
=
"nc"
self
.
_cpu_only
=
True
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
def
init_axis
(
self
):
self
.
axis
=
0
def
test_check_grad_normal
(
self
):
pass
def
test_check_grad_ingore_x
(
self
):
pass
def
test_check_grad_ingore_y
(
self
):
pass
class
TestElementwiseMulMKLDNNOp_FallbackNCHW
(
ElementwiseMulOp
):
def
init_input_output
(
self
):
self
.
x
=
np
.
random
.
rand
(
1
,
16
,
2
,
2
).
astype
(
self
.
dtype
)
self
.
y
=
np
.
random
.
rand
(
1
,
16
).
astype
(
self
.
dtype
)
self
.
out
=
self
.
x
*
self
.
y
.
reshape
(
1
,
16
,
1
,
1
)
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
def
init_axis
(
self
):
self
.
axis
=
0
def
test_check_grad_normal
(
self
):
pass
def
test_check_grad_ingore_x
(
self
):
pass
def
test_check_grad_ingore_y
(
self
):
pass
class
TestElementwiseMulMKLDNNOp_FallbackNCHW16C
(
ElementwiseMulOp
):
def
init_input_output
(
self
):
x
=
np
.
random
.
rand
(
1
,
16
,
2
,
2
).
astype
(
self
.
dtype
)
self
.
x
=
x
.
transpose
(
0
,
2
,
3
,
1
).
reshape
(
1
,
16
,
2
,
2
)
y
=
np
.
random
.
rand
(
1
,
16
,
2
,
2
).
astype
(
self
.
dtype
)
self
.
y
=
y
.
transpose
(
0
,
2
,
3
,
1
).
reshape
(
1
,
16
,
2
,
2
)
self
.
out
=
self
.
x
*
self
.
y
def
setUp
(
self
):
super
(
TestElementwiseMulMKLDNNOp_FallbackNCHW16C
,
self
).
setUp
()
self
.
attrs
[
"x_data_format"
]
=
"nchw16c"
self
.
attrs
[
"y_data_format"
]
=
"nchw16c"
self
.
_cpu_only
=
True
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
def
init_axis
(
self
):
self
.
axis
=
0
def
test_check_grad_normal
(
self
):
pass
def
test_check_grad_ingore_x
(
self
):
pass
def
test_check_grad_ingore_y
(
self
):
pass
class
TestElementwiseMulMKLDNNOp_FallbackNoReorders
(
ElementwiseMulOp
):
def
init_input_output
(
self
):
x
=
np
.
random
.
rand
(
1
,
16
,
2
,
2
).
astype
(
self
.
dtype
)
self
.
x
=
x
.
transpose
(
0
,
2
,
3
,
1
).
reshape
(
1
,
16
,
2
,
2
)
y
=
np
.
random
.
rand
(
1
,
16
,
2
,
2
).
astype
(
self
.
dtype
)
self
.
y
=
y
.
transpose
(
0
,
2
,
3
,
1
).
reshape
(
1
,
16
,
2
,
2
)
self
.
out
=
self
.
x
*
self
.
y
def
setUp
(
self
):
super
(
TestElementwiseMulMKLDNNOp_FallbackNoReorders
,
self
).
setUp
()
self
.
attrs
[
"x_data_format"
]
=
"nchw16c"
self
.
attrs
[
"y_data_format"
]
=
"nchw16c"
self
.
_cpu_only
=
True
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
def
init_axis
(
self
):
self
.
axis
=
0
def
test_check_grad_normal
(
self
):
pass
def
test_check_grad_ingore_x
(
self
):
pass
def
test_check_grad_ingore_y
(
self
):
pass
class
TestElementwiseMulMKLDNNOp_FallbackWithReorder1
(
ElementwiseMulOp
):
def
init_input_output
(
self
):
self
.
x
=
np
.
random
.
rand
(
1
,
16
,
2
,
2
).
astype
(
self
.
dtype
)
y
=
np
.
random
.
rand
(
1
,
16
,
2
,
2
).
astype
(
self
.
dtype
)
self
.
y
=
y
.
transpose
(
0
,
2
,
3
,
1
).
reshape
(
1
,
16
,
2
,
2
)
self
.
out
=
self
.
x
*
y
def
setUp
(
self
):
super
(
TestElementwiseMulMKLDNNOp_FallbackWithReorder1
,
self
).
setUp
()
self
.
attrs
[
"x_data_format"
]
=
"nchw"
self
.
attrs
[
"y_data_format"
]
=
"nchw16c"
self
.
_cpu_only
=
True
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
def
init_axis
(
self
):
self
.
axis
=
0
def
test_check_grad_normal
(
self
):
pass
def
test_check_grad_ingore_x
(
self
):
pass
def
test_check_grad_ingore_y
(
self
):
pass
class
TestElementwiseMulMKLDNNOp_FallbackWithReorder2
(
ElementwiseMulOp
):
def
init_input_output
(
self
):
self
.
y
=
np
.
random
.
rand
(
1
,
16
,
2
,
2
).
astype
(
self
.
dtype
)
x
=
np
.
random
.
rand
(
1
,
16
,
2
,
2
).
astype
(
self
.
dtype
)
self
.
x
=
x
.
transpose
(
0
,
2
,
3
,
1
).
reshape
(
1
,
16
,
2
,
2
)
self
.
out
=
x
*
self
.
y
def
setUp
(
self
):
super
(
TestElementwiseMulMKLDNNOp_FallbackWithReorder2
,
self
).
setUp
()
self
.
attrs
[
"x_data_format"
]
=
"nchw16c"
self
.
attrs
[
"y_data_format"
]
=
"nchw"
self
.
_cpu_only
=
True
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
def
init_axis
(
self
):
self
.
axis
=
0
def
test_check_grad_normal
(
self
):
pass
def
test_check_grad_ingore_x
(
self
):
pass
def
test_check_grad_ingore_y
(
self
):
pass
class
TestElementwiseMulMKLDNNOp_FallbackNoReorders2
(
ElementwiseMulOp
):
def
init_input_output
(
self
):
self
.
x
=
np
.
random
.
rand
(
1
,
16
).
astype
(
self
.
dtype
)
self
.
y
=
np
.
random
.
rand
(
1
,
16
).
astype
(
self
.
dtype
)
self
.
out
=
self
.
x
*
self
.
y
def
setUp
(
self
):
super
(
TestElementwiseMulMKLDNNOp_FallbackNoReorders2
,
self
).
setUp
()
self
.
attrs
[
"x_data_format"
]
=
"nc"
self
.
attrs
[
"y_data_format"
]
=
"nc"
self
.
_cpu_only
=
True
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
def
init_axis
(
self
):
self
.
axis
=
0
def
test_check_grad_normal
(
self
):
pass
def
test_check_grad_ingore_x
(
self
):
pass
def
test_check_grad_ingore_y
(
self
):
pass
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
浏览文件 @
b5c44fd4
...
...
@@ -21,13 +21,24 @@ from paddle.fluid.op import Operator
class
ElementwiseMulOp
(
OpTest
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
False
def
setUp
(
self
):
self
.
op_type
=
"elementwise_mul"
self
.
dtype
=
np
.
float32
self
.
axis
=
-
1
self
.
init_dtype
()
self
.
init_input_output
()
self
.
init_kernel_type
()
self
.
init_axis
()
self
.
inputs
=
{
'X'
:
np
.
random
.
uniform
(
0.1
,
1
,
[
13
,
17
]).
astype
(
"float64"
),
'Y'
:
np
.
random
.
uniform
(
0.1
,
1
,
[
13
,
17
]).
astype
(
"float64"
)
'X'
:
OpTest
.
np_dtype_to_fluid_dtype
(
self
.
x
),
'Y'
:
OpTest
.
np_dtype_to_fluid_dtype
(
self
.
y
)
}
self
.
outputs
=
{
'Out'
:
np
.
multiply
(
self
.
inputs
[
'X'
],
self
.
inputs
[
'Y'
])}
self
.
outputs
=
{
'Out'
:
self
.
out
}
self
.
attrs
=
{
'axis'
:
self
.
axis
,
'use_mkldnn'
:
self
.
use_mkldnn
}
def
test_check_output
(
self
):
self
.
check_output
()
...
...
@@ -41,6 +52,17 @@ class ElementwiseMulOp(OpTest):
def
test_check_grad_ingore_y
(
self
):
self
.
check_grad
([
'X'
],
'Out'
,
no_grad_set
=
set
(
'Y'
))
def
init_input_output
(
self
):
self
.
x
=
np
.
random
.
uniform
(
0.1
,
1
,
[
13
,
17
]).
astype
(
self
.
dtype
)
self
.
y
=
np
.
random
.
uniform
(
0.1
,
1
,
[
13
,
17
]).
astype
(
self
.
dtype
)
self
.
out
=
np
.
multiply
(
self
.
x
,
self
.
y
)
def
init_dtype
(
self
):
pass
def
init_axis
(
self
):
pass
class
TestElementwiseMulOp_scalar
(
ElementwiseMulOp
):
def
setUp
(
self
):
...
...
@@ -63,17 +85,13 @@ class TestElementwiseMulOp_Vector(ElementwiseMulOp):
class
TestElementwiseMulOp_broadcast_0
(
ElementwiseMulOp
):
def
setUp
(
self
):
self
.
op_type
=
"elementwise_mul"
self
.
inputs
=
{
'X'
:
np
.
random
.
rand
(
2
,
3
,
4
).
astype
(
np
.
float64
),
'Y'
:
np
.
random
.
rand
(
2
).
astype
(
np
.
float64
)
}
def
init_input_output
(
self
):
self
.
x
=
np
.
random
.
rand
(
2
,
3
,
4
).
astype
(
self
.
dtype
)
self
.
y
=
np
.
random
.
rand
(
2
).
astype
(
self
.
dtype
)
self
.
out
=
self
.
x
*
self
.
y
.
reshape
(
2
,
1
,
1
)
self
.
attrs
=
{
'axis'
:
0
}
self
.
outputs
=
{
'Out'
:
self
.
inputs
[
'X'
]
*
self
.
inputs
[
'Y'
].
reshape
(
2
,
1
,
1
)
}
def
init_axis
(
self
):
self
.
axis
=
0
class
TestElementwiseMulOp_broadcast_1
(
ElementwiseMulOp
):
...
...
python/paddle/fluid/tests/unittests/test_group_norm_op.py
0 → 100644
浏览文件 @
b5c44fd4
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
operator
import
mul
import
paddle.fluid.core
as
core
import
paddle.fluid
as
fluid
from
op_test
import
OpTest
from
testsuite
import
create_op
def
group_norm_naive
(
x
,
scale
,
bias
,
epsilon
,
groups
):
N
,
C
,
H
,
W
=
x
.
shape
G
=
groups
x
=
x
.
reshape
((
N
*
G
,
-
1
))
mean
=
np
.
mean
(
x
,
axis
=
1
,
keepdims
=
True
)
var
=
np
.
var
(
x
,
axis
=
1
,
keepdims
=
True
)
output
=
(
x
-
mean
)
/
np
.
sqrt
(
var
+
epsilon
)
output
=
output
.
reshape
((
N
,
C
,
H
,
W
))
*
scale
.
reshape
(
(
-
1
,
1
,
1
))
+
bias
.
reshape
((
-
1
,
1
,
1
))
return
output
,
mean
.
reshape
((
N
,
G
)),
var
.
reshape
((
N
,
G
))
class
TestGroupNormOp
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"group_norm"
self
.
data_format
=
"NCHW"
self
.
dtype
=
np
.
float32
self
.
shape
=
(
2
,
4
,
3
,
3
)
self
.
attrs
=
{
'epsilon'
:
1e-5
,
'groups'
:
2
}
self
.
compare_between_place
=
False
self
.
init_test_case
()
input
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
scale
=
np
.
random
.
random
([
self
.
shape
[
1
]]).
astype
(
self
.
dtype
)
bias
=
np
.
random
.
random
([
self
.
shape
[
1
]]).
astype
(
self
.
dtype
)
output
,
mean
,
var
=
group_norm_naive
(
input
,
scale
,
bias
,
self
.
attrs
[
'epsilon'
],
self
.
attrs
[
'groups'
])
self
.
inputs
=
{
'X'
:
OpTest
.
np_dtype_to_fluid_dtype
(
input
),
'Scale'
:
OpTest
.
np_dtype_to_fluid_dtype
(
scale
),
'Bias'
:
OpTest
.
np_dtype_to_fluid_dtype
(
bias
)
}
self
.
outputs
=
{
'Y'
:
output
,
'Mean'
:
mean
,
'Variance'
:
var
}
def
test_check_output
(
self
):
atol
=
1e-4
place
=
core
.
CPUPlace
()
self
.
check_output_with_place
(
place
,
atol
=
atol
)
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
self
.
check_output_with_place
(
place
,
atol
=
atol
)
def
do_compare_between_place
(
self
):
if
not
core
.
is_compiled_with_cuda
():
return
place
=
core
.
CPUPlace
()
place2
=
core
.
CUDAPlace
(
0
)
self
.
scope
=
core
.
Scope
()
op_inputs
=
self
.
inputs
if
hasattr
(
self
,
"inputs"
)
else
dict
()
op_outputs
=
self
.
outputs
if
hasattr
(
self
,
"outputs"
)
else
dict
()
op_attrs
=
self
.
attrs
if
hasattr
(
self
,
"attrs"
)
else
dict
()
self
.
op
=
create_op
(
self
.
scope
,
self
.
op_type
,
op_inputs
,
op_outputs
,
op_attrs
)
inputs_to_check
=
set
([
'X'
,
'Scale'
,
'Bias'
])
output_names
=
'Y'
cpu_grads
=
self
.
_get_gradient
(
inputs_to_check
,
place
,
output_names
,
None
)
gpu_grads
=
self
.
_get_gradient
(
inputs_to_check
,
place2
,
output_names
,
None
)
self
.
_assert_is_close
(
cpu_grads
,
gpu_grads
,
inputs_to_check
,
0.005
,
"Gradient Check On %s"
%
str
(
place
))
def
test_check_grad
(
self
):
if
self
.
compare_between_place
:
self
.
do_compare_between_place
()
return
place
=
core
.
CPUPlace
()
self
.
check_grad_with_place
(
place
,
set
([
'X'
,
'Scale'
,
'Bias'
]),
'Y'
,
max_relative_error
=
0.01
)
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
self
.
check_grad_with_place
(
place
,
set
([
'X'
,
'Scale'
,
'Bias'
]),
'Y'
,
max_relative_error
=
0.01
)
def
init_test_case
(
self
):
pass
class
TestGroupNormOp1
(
TestGroupNormOp
):
def
init_test_case
(
self
):
self
.
attrs
[
'groups'
]
=
1
class
TestGroupNormOp2
(
TestGroupNormOp
):
def
init_test_case
(
self
):
self
.
attrs
[
'groups'
]
=
4
class
TestGroupNormOpBigEps1
(
TestGroupNormOp
):
def
init_test_case
(
self
):
self
.
attrs
[
'groups'
]
=
1
self
.
attrs
[
'epsilon'
]
=
0.5
class
TestGroupNormOpBigEps2
(
TestGroupNormOp
):
def
init_test_case
(
self
):
self
.
attrs
[
'groups'
]
=
4
self
.
attrs
[
'epsilon'
]
=
0.5
class
TestGroupNormOpBigEps3
(
TestGroupNormOp
):
def
init_test_case
(
self
):
self
.
attrs
[
'epsilon'
]
=
0.5
class
TestGroupNormOpLargeData
(
TestGroupNormOp
):
def
init_test_case
(
self
):
self
.
shape
=
(
2
,
32
,
64
,
64
)
self
.
attrs
[
'groups'
]
=
8
self
.
compare_between_place
=
True
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
b5c44fd4
...
...
@@ -202,6 +202,17 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
layers
.
sequence_unpad
(
x
=
x
,
length
=
length
))
print
(
str
(
program
))
def
test_pool2d
(
self
):
program
=
Program
()
with
program_guard
(
program
):
x
=
layers
.
data
(
name
=
'x'
,
shape
=
[
3
,
224
,
224
],
dtype
=
'float32'
)
self
.
assertIsNotNone
(
layers
.
pool2d
(
x
,
pool_size
=
[
5
,
3
],
pool_stride
=
[
1
,
2
],
pool_padding
=
(
2
,
1
)))
def
test_lstm_unit
(
self
):
program
=
Program
()
with
program_guard
(
program
):
...
...
python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
浏览文件 @
b5c44fd4
...
...
@@ -145,10 +145,15 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
lod
.
append
(
nmsed_num
)
if
nmsed_num
==
0
:
continue
tmp_det_out
=
[]
for
c
,
indices
in
nmsed_outs
.
items
():
for
idx
in
indices
:
xmin
,
ymin
,
xmax
,
ymax
=
boxes
[
n
][
idx
][:]
det_outs
.
append
([
c
,
scores
[
n
][
c
][
idx
],
xmin
,
ymin
,
xmax
,
ymax
])
tmp_det_out
.
append
(
[
c
,
scores
[
n
][
c
][
idx
],
xmin
,
ymin
,
xmax
,
ymax
])
sorted_det_out
=
sorted
(
tmp_det_out
,
key
=
lambda
tup
:
tup
[
0
],
reverse
=
False
)
det_outs
.
extend
(
sorted_det_out
)
return
det_outs
,
lod
...
...
python/requirements.txt
浏览文件 @
b5c44fd4
requests==2.9.2
numpy>=1.12
,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version
numpy>=1.12
protobuf==3.1
recordio>=0.1.0
matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
...
...
tools/manylinux1/Dockerfile.x64
浏览文件 @
b5c44fd4
...
...
@@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
COPY build_scripts /build_scripts
RUN bash build_scripts/build.sh && \
bash build_scripts/install_nccl2.sh && rm -r build_scripts
bash build_scripts/install_nccl2.sh && rm -r
f
build_scripts
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
...
...
@@ -36,17 +36,21 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf
tar xzf protobuf-cpp-3.1.0.tar.gz && \
cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
RUN wget
-O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python
/requirements.txt
RUN wget
https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root
/requirements.txt
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install -r /root/requirements.txt && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \
go get github.com/Masterminds/glide && \
rm -rf /root/requirements.txt
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
...
...
tools/manylinux1/build_scripts/build.sh
浏览文件 @
b5c44fd4
...
...
@@ -9,12 +9,12 @@ set -ex
# remove others to expedite build and reduce docker image size. The original
# manylinux docker image project builds many python versions.
# NOTE We added back 3.5.1, since auditwheel requires python 3.3+
CPYTHON_VERSIONS
=
"
2.7.11 3.5.
1"
CPYTHON_VERSIONS
=
"
3.7.0 3.6.0 3.5.1 2.7.1
1"
# openssl version to build, with expected sha256 hash of .tar.gz
# archive
OPENSSL_ROOT
=
openssl-1.
0.2l
OPENSSL_HASH
=
ce07195b659e75f4e1db43552860070061f156a98bb37b672b101ba6e3ddf30c
OPENSSL_ROOT
=
openssl-1.
1.0i
OPENSSL_HASH
=
ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99
EPEL_RPM_HASH
=
e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
DEVTOOLS_HASH
=
a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
PATCHELF_HASH
=
d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
...
...
@@ -25,7 +25,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
# Dependencies for compiling Python that we want to remove from
# the final image after compiling Python
PYTHON_COMPILE_DEPS
=
"zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel"
PYTHON_COMPILE_DEPS
=
"zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel
libffi-devel
"
# Libraries that are allowed as part of the manylinux1 profile
MANYLINUX1_DEPS
=
"glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel"
...
...
@@ -61,7 +61,7 @@ yum -y install bzip2 make git patch unzip bison yasm diffutils \
wget
-q
https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz
&&
tar
xzf cmake-3.5.2.tar.gz
&&
\
cd
cmake-3.5.2
&&
./bootstrap
&&
\
make
-j
4
&&
make
install
&&
cd
..
&&
rm
cmake-3.5.2.tar.gz
make
-j
8
&&
make
install
&&
cd
..
&&
rm
cmake-3.5.2.tar.gz
# Install newest autoconf
...
...
@@ -77,11 +77,13 @@ mkdir -p /opt/python
build_cpythons
$CPYTHON_VERSIONS
PY35_BIN
=
/opt/python/cp35-cp35m/bin
PY36_BIN
=
/opt/python/cp36-cp36m/bin
PY37_BIN
=
/opt/python/cp37-cp37m/bin
# NOTE Since our custom manylinux image builds pythons with shared
# libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running
# python.
ORIGINAL_LD_LIBRARY_PATH
=
"
${
LD_LIBRARY_PATH
}
"
LD_LIBRARY_PATH
=
"
${
ORIGINAL_LD_LIBRARY_PATH
}
:
$(
dirname
${
PY35_BIN
}
)
/lib"
LD_LIBRARY_PATH
=
"
${
ORIGINAL_LD_LIBRARY_PATH
}
:
$(
dirname
${
PY35_BIN
}
)
/lib
:
$(
dirname
${
PY36_BIN
}
)
/lib:
$(
dirname
${
PY37_BIN
}
)
/lib
"
# Our openssl doesn't know how to find the system CA trust store
# (https://github.com/pypa/manylinux/issues/53)
...
...
@@ -119,9 +121,8 @@ ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel
# final image
yum
-y
erase wireless-tools gtk2 libX11 hicolor-icon-theme
\
avahi freetype bitstream-vera-fonts
\
${
PYTHON_COMPILE_DEPS
}
>
/dev/null 2>&1
yum
-y
install
${
MANYLINUX1_DEPS
}
yum
-y
clean all
>
/dev/null 2>&1
${
PYTHON_COMPILE_DEPS
}
>
/dev/null 2>&1
||
true
yum
-y
install
${
MANYLINUX1_DEPS
}
&&
yum
-y
clean all
>
/dev/null 2>&1
||
true
yum list installed
# we don't need libpython*.a, and they're many megabytes
find /opt/_internal
-name
'*.a'
-print0
| xargs
-0
rm
-f
...
...
tools/manylinux1/build_scripts/build_utils.sh
浏览文件 @
b5c44fd4
...
...
@@ -50,11 +50,28 @@ function do_cpython_build {
mkdir
-p
${
prefix
}
/lib
# -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
if
[
$(
lex_pyver
$py_ver
)
-eq
$(
lex_pyver 3.6
)
]
;
then
wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz
tar
-zxf
sqlite-autoconf-3250300.tar.gz
cd
sqlite-autoconf-3250300
./configure
--prefix
=
/usr/local
make
-j8
&&
make
install
cd
../
&&
rm
sqlite-autoconf-3250300.tar.gz
fi
# NOTE --enable-shared for generating libpython shared library needed for
# linking of some of the nupic.core test executables.
CFLAGS
=
"-Wformat"
./configure
--prefix
=
${
prefix
}
--enable-shared
$unicode_flags
>
/dev/null
make
-j2
>
/dev/null
make
install
>
/dev/null
if
[
$(
lex_pyver
$py_ver
)
-ge
$(
lex_pyver 3.7
)
]
;
then
# NOTE python 3.7 should be installed via make altinstall rather than
# make install, and we should specify the location of ssl
CFLAGS
=
"-Wformat"
./configure
--prefix
=
${
prefix
}
--with-openssl
=
/usr/local/ssl
--enable-shared
$unicode_flags
>
/dev/null
make
-j8
>
/dev/null
make altinstall
>
/dev/null
else
LD_LIBRARY_PATH
=
/usr/local/lib:
${
LD_LIBRARY_PATH
}
CFLAGS
=
"-Wformat"
./configure
--prefix
=
${
prefix
}
--enable-shared
$unicode_flags
>
/dev/null
LD_LIBRARY_PATH
=
/usr/local/lib:
${
LD_LIBRARY_PATH
}
make
-j8
>
/dev/null
LD_LIBRARY_PATH
=
/usr/local/lib:
${
LD_LIBRARY_PATH
}
make
install
>
/dev/null
fi
popd
echo
"ZZZ looking for libpython"
find /
-name
'libpython*.so*'
...
...
@@ -64,6 +81,9 @@ function do_cpython_build {
if
[
-e
${
prefix
}
/bin/python3
]
;
then
ln
-s
python3
${
prefix
}
/bin/python
fi
if
[
-e
${
prefix
}
/bin/python3.7
]
;
then
ln
-s
python3.7
${
prefix
}
/bin/python
fi
# NOTE Make libpython shared library visible to python calls below
LD_LIBRARY_PATH
=
"
${
prefix
}
/lib"
${
prefix
}
/bin/python get-pip.py
LD_LIBRARY_PATH
=
"
${
prefix
}
/lib"
${
prefix
}
/bin/pip
install
wheel
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录