BaiXuePrincess / Paddle
Forked from PaddlePaddle / Paddle
Commit 2c4fcaa6
Authored on Mar 07, 2019 by sneaxiy

merge develop

Parents: 2a639d5c, 40f1dd81
Showing 125 changed files with 4345 additions and 1337 deletions (+4345, -1337)
Dockerfile  +3 -2
paddle/fluid/API.spec  +5 -3
paddle/fluid/framework/details/memory_optimize_helper.cc  +7 -5
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc  +7 -0
paddle/fluid/framework/inlined_stack.h  +2 -3
paddle/fluid/framework/ir/fuse_pass_base.h  +5 -0
paddle/fluid/framework/ir/graph_helper.cc  +7 -1
paddle/fluid/framework/operator.cc  +0 -41
paddle/fluid/framework/operator.h  +2 -78
paddle/fluid/inference/analysis/argument.h  +6 -0
paddle/fluid/inference/analysis/helper.h  +31 -0
paddle/fluid/inference/analysis/ir_pass_manager.cc  +3 -0
paddle/fluid/inference/analysis/ir_pass_manager.h  +3 -0
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc  +193 -74
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h  +9 -3
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc  +11 -0
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h  +1 -0
paddle/fluid/inference/api/analysis_config.cc  +3 -1
paddle/fluid/inference/api/analysis_predictor.cc  +35 -0
paddle/fluid/inference/api/analysis_predictor.h  +9 -0
paddle/fluid/inference/api/api_impl.cc  +3 -0
paddle/fluid/inference/api/details/zero_copy_tensor.cc  +58 -2
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc  +1 -1
paddle/fluid/inference/api/helper.h  +5 -0
paddle/fluid/inference/api/paddle_analysis_config.h  +3 -1
paddle/fluid/inference/api/paddle_api.h  +21 -1
paddle/fluid/inference/engine.h  +0 -5
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc  +2 -19
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc  +1 -2
paddle/fluid/inference/tensorrt/convert/fc_op.cc  +2 -2
paddle/fluid/inference/tensorrt/convert/op_converter.h  +62 -0
paddle/fluid/inference/tensorrt/convert/prelu_op.cc  +8 -11
paddle/fluid/inference/tensorrt/convert/ut_helper.h  +51 -34
paddle/fluid/inference/tensorrt/engine.cc  +12 -131
paddle/fluid/inference/tensorrt/engine.h  +39 -52
paddle/fluid/inference/tensorrt/helper.h  +29 -0
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt  +2 -1
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu  +7 -0
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h  +9 -5
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu  +9 -2
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h  +13 -7
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu  +14 -1
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h  +30 -13
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu  +6 -0
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h  +6 -3
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h  +9 -1
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc  +48 -0
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h  +78 -0
paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h  +8 -1
paddle/fluid/inference/tensorrt/test_engine.cc  +92 -42
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc  +6 -4
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc  +6 -4
paddle/fluid/inference/tests/api/tester_helper.h  +8 -4
paddle/fluid/inference/tests/api/trt_models_tester.cc  +2 -1
paddle/fluid/memory/allocation/allocator.cc  +2 -5
paddle/fluid/memory/allocation/allocator.h  +3 -3
paddle/fluid/memory/allocation/legacy_allocator.cc  +49 -32
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc  +123 -13
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h  +1 -1
paddle/fluid/memory/detail/buddy_allocator.cc  +39 -36
paddle/fluid/memory/detail/buddy_allocator.h  +8 -3
paddle/fluid/memory/detail/memory_block.h  +5 -4
paddle/fluid/operators/benchmark/op_tester.cc  +190 -17
paddle/fluid/operators/benchmark/op_tester.h  +9 -2
paddle/fluid/operators/benchmark/op_tester_config.cc  +58 -20
paddle/fluid/operators/benchmark/op_tester_config.h  +22 -0
paddle/fluid/operators/cast_op.cc  +3 -1
paddle/fluid/operators/detection/CMakeLists.txt  +3 -0
paddle/fluid/operators/detection/box_decoder_and_assign_op.cc  +169 -0
paddle/fluid/operators/detection/box_decoder_and_assign_op.cu  +147 -0
paddle/fluid/operators/detection/box_decoder_and_assign_op.h  +103 -0
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc  +93 -0
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu  +221 -0
paddle/fluid/operators/detection/distribute_fpn_proposals_op.h  +147 -0
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h  +10 -13
paddle/fluid/operators/jit/benchmark.cc  +23 -0
paddle/fluid/operators/jit/gen/CMakeLists.txt  +1 -0
paddle/fluid/operators/jit/gen/vbroadcast.cc  +91 -0
paddle/fluid/operators/jit/gen/vbroadcast.h  +53 -0
paddle/fluid/operators/jit/helper.cc  +2 -0
paddle/fluid/operators/jit/kernel_base.h  +9 -0
paddle/fluid/operators/jit/kernel_key.cc  +5 -0
paddle/fluid/operators/jit/more/mkl/CMakeLists.txt  +2 -0
paddle/fluid/operators/jit/more/mkl/mkl.cc  +18 -0
paddle/fluid/operators/jit/more/mkl/mkl.h  +10 -0
paddle/fluid/operators/jit/refer/CMakeLists.txt  +2 -0
paddle/fluid/operators/jit/refer/refer.cc  +3 -0
paddle/fluid/operators/jit/refer/refer.h  +17 -0
paddle/fluid/operators/jit/test.cc  +67 -24
paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc  +94 -0
paddle/fluid/operators/recurrent_op.cc  +29 -16
paddle/fluid/operators/requantize_op.cc  +46 -0
paddle/fluid/operators/requantize_op.h  +47 -0
paddle/fluid/operators/reshape_op.cc  +4 -1
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc  +3 -0
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h  +63 -116
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc  +2 -0
paddle/fluid/platform/gpu_info.cc  +58 -1
paddle/fluid/platform/gpu_info.h  +6 -0
paddle/fluid/platform/temporary_allocator.cc  +1 -0
paddle/fluid/pybind/inference_api.cc  +2 -1
python/paddle/fluid/__init__.py  +1 -0
python/paddle/fluid/imperative/layer_object_helper.py  +220 -0
python/paddle/fluid/imperative/layers.py  +48 -1
python/paddle/fluid/imperative/nn.py  +32 -53
python/paddle/fluid/initializer.py  +8 -9
python/paddle/fluid/layer_helper.py  +14 -309
python/paddle/fluid/layer_helper_base.py  +381 -0
python/paddle/fluid/layers/control_flow.py  +2 -2
python/paddle/fluid/layers/detection.py  +137 -0
python/paddle/fluid/layers/nn.py  +3 -7
python/paddle/fluid/layers/tensor.py  +2 -1
python/paddle/fluid/optimizer.py  +1 -1
python/paddle/fluid/parallel_executor.py  +12 -7
python/paddle/fluid/tests/test_detection.py  +16 -0
python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py  +14 -0
python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py  +1 -14
python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py  +93 -0
python/paddle/fluid/tests/unittests/test_base_layer.py  +14 -24
python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py  +96 -0
python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py  +40 -0
python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py  +117 -0
python/paddle/fluid/tests/unittests/test_imperative_basic.py  +26 -26
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py  +1 -1
python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py  +11 -12
Dockerfile
...
...
@@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
-    tar -xz -C /usr/local && \
+RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
+    tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
+    cp -rf /usr/local/TensorRT/include /usr && \
    cp -rf /usr/local/TensorRT/lib /usr
...
...
paddle/fluid/API.spec
...
...
@@ -238,7 +238,7 @@ paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], var
paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae'))
paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8'))
paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4'))
-paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '60cb8f843d625abf33f8bf12455b8f99'))
+paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3'))
paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb'))
paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535'))
paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816'))
...
...
@@ -262,7 +262,7 @@ paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keyword
paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', '73bb96ec4783ec1a11e760e8851b0e77'))
paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713'))
paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a'))
-paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
+paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77'))
paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', '0275133f1dde2aed528b4d3230edf823'))
paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2'))
...
...
@@ -287,7 +287,7 @@ paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=N
paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
-paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
+paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '29a25ba78de79152076cacfc5443137d'))
paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
...
...
@@ -329,6 +329,8 @@ paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varar
paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f'))
paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd'))
paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47'))
paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51'))
...
...
paddle/fluid/framework/details/memory_optimize_helper.cc
...
...
@@ -20,6 +20,9 @@
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
...
...
@@ -302,7 +305,10 @@ std::string OrderedSet::ToString() const {
bool NodeCanReused(ir::Node* node) {
  // valid the node is a var node
-  if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
+  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
+  if (node == nullptr || !node->IsVar() || node->IsCtrlVar() ||
+      node->Name() == kEmptyVarName)
+    return false;

  bool flag = true;
  // op output force generated in cpu, can not be reused.
...
...
@@ -348,10 +354,6 @@ bool NodeCanReused(const VarDesc& node) {
  if (shape.empty() || size < MinChunkSize()) {
    return false;
  }
-  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
-  std::string name = node.Name();
-  if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
-    return false;
  return true;
}
...
...
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
...
...
@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
...
...
@@ -29,6 +31,11 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
    auto &g = graphs.back();
    g->Set(kGraphVars, new GraphVars(1UL));
    g->Set(kGraphDepVars, new GraphDepVars);
+    auto &stale_ops =
+        graph->Get<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs);
+    g->Erase(details::kStaleProgramOpDescs);
+    g->Set<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs,
+                                        new std::vector<OpDesc *>(stale_ops));
  }

  auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
...
...
paddle/fluid/framework/small_stack.h → paddle/fluid/framework/inlined_stack.h
...
...
@@ -14,7 +14,6 @@
#pragma once
#include <array>
#include <deque>
#include "paddle/fluid/platform/enforce.h"
...
...
@@ -22,7 +21,7 @@ namespace paddle {
namespace framework {

template <typename T, size_t N>
-class SmallStack {
+class InlinedStack {
  static_assert(N > 0, "N must be larger than 0");

 public:
...
...
@@ -66,8 +65,8 @@ class SmallStack {
 private:
  T head_[N];
-  size_t size_{0};
  std::deque<T> tail_;
+  size_t size_;
};

}  // namespace framework
...
...
paddle/fluid/framework/ir/fuse_pass_base.h
...
...
@@ -14,6 +14,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
...
...
@@ -24,6 +25,10 @@ namespace ir {
static const char kParamScopeAttr[] = "__param_scope__";
static const char kFuseStatisAttr[] = "__fuse_statis__";
+// When we use trt or other third_party lib, the parameters are managed by
+// the lib, but not the fluid. So we need to record them to avoid duplicate
+// allocation.
+static const char kRepetitiveParamAttr[] = "__repetitive_param__";

enum FuseOptions {
  DO_NOT_FUSE,  // fusing will not be done
...
...
paddle/fluid/framework/ir/graph_helper.cc
...
...
@@ -130,15 +130,21 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    if (adj_list.find(n) == adj_list.end()) {
      adj_list[n] = std::unordered_set<ir::Node *>();
    }
+    std::vector<ir::Node *> nodes;
    for (auto &var : n->inputs) {
      for (auto &adj_n : var->inputs) {
        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
        VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                << " -> " << n->Name() << reinterpret_cast<void *>(n)
                << " via " << var->Name() << reinterpret_cast<void *>(var);
-        adj_list[n].insert(adj_n);
+        nodes.push_back(adj_n);
      }
    }
+    std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
+      return node1->id() > node2->id();
+    });
+    adj_list[n].insert(std::make_move_iterator(nodes.begin()),
+                       std::make_move_iterator(nodes.end()));
  }
  return adj_list;
}
...
...
paddle/fluid/framework/operator.cc
...
...
@@ -467,12 +467,6 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {
  return it->second.empty() ? nullptr : it->second[0];
}

-const Variable* ExecutionContext::LegacyInputVar(
-    const std::string& name) const {
-  auto ipt = op_.Input(name);
-  return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-}

Variable* ExecutionContext::OutputVar(const std::string& name) const {
  auto it = ctx_.outputs.find(name);
  if (it == ctx_.outputs.end()) return nullptr;
...
...
@@ -483,22 +477,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
  return it->second.empty() ? nullptr : it->second[0];
}

-Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
-  auto opt = op_.Output(name);
-  return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
-}

template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
  return Input<LoDTensor>(name);
}

-template <>
-const Tensor* ExecutionContext::LegacyInput<Tensor>(
-    const std::string& name) const {
-  return LegacyInput<LoDTensor>(name);
-}

template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
    const std::string& name) const {
...
@@ -521,35 +504,11 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
return
res
;
}
template
<
>
const
std
::
vector
<
const
Tensor
*>
ExecutionContext
::
LegacyMultiInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
auto
names
=
op
().
Inputs
(
name
);
std
::
vector
<
const
Tensor
*>
res
;
res
.
reserve
(
names
.
size
());
std
::
transform
(
names
.
begin
(),
names
.
end
(),
std
::
back_inserter
(
res
),
[
&
](
const
std
::
string
&
sub_name
)
->
const
Tensor
*
{
auto
var
=
scope_
.
FindVar
(
sub_name
);
if
(
var
==
nullptr
)
return
nullptr
;
PADDLE_ENFORCE
(
var
->
IsType
<
LoDTensor
>
(),
"%s should be LoDTensor, but the received type is %s"
,
sub_name
,
ToTypeName
(
var
->
Type
()));
return
&
(
var
->
Get
<
LoDTensor
>
());
});
return
res
;
}
template
<
>
Tensor
*
ExecutionContext
::
Output
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
return
Output
<
LoDTensor
>
(
name
);
}
template
<
>
Tensor
*
ExecutionContext
::
LegacyOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
return
LegacyOutput
<
LoDTensor
>
(
name
);
}
template
<
>
std
::
vector
<
Tensor
*>
ExecutionContext
::
MultiOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
...
...
paddle/fluid/framework/operator.h
...
...
@@ -16,9 +16,11 @@ limitations under the License. */
#include <algorithm>
#include <atomic>
#include <memory>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
#include "glog/logging.h" // For VLOG
...
...
@@ -253,31 +255,6 @@ class ExecutionContext {
    return it->second;
  }

-  const std::vector<Variable*> LegacyMultiInputVar(
-      const std::string& name) const {
-    auto names = op_.Inputs(name);
-    std::vector<Variable*> res;
-    res.reserve(names.size());
-    std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [this](const std::string& name) {
-                     return name == kEmptyVarName ? nullptr
-                                                  : scope_.FindVar(name);
-                   });
-    return res;
-  }

-  std::vector<Variable*> LegacyMultiOutputVar(const std::string& name) const {
-    auto names = op_.Outputs(name);
-    std::vector<Variable*> res;
-    res.reserve(names.size());
-    std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [this](const std::string& name) {
-                     return name == kEmptyVarName ? nullptr
-                                                  : scope_.FindVar(name);
-                   });
-    return res;
-  }

  template <typename T>
  const T* Input(const std::string& name) const {
    auto* var = InputVar(name);
...
@@ -290,22 +267,6 @@ class ExecutionContext {
return
var
==
nullptr
?
nullptr
:
var
->
GetMutable
<
T
>
();
}
template
<
typename
T
>
const
T
*
LegacyInput
(
const
std
::
string
&
name
)
const
{
auto
*
var
=
LegacyInputVar
(
name
);
return
var
==
nullptr
?
nullptr
:
&
var
->
Get
<
T
>
();
}
template
<
typename
T
>
T
*
LegacyOutput
(
const
std
::
string
&
name
)
const
{
auto
var
=
LegacyOutputVar
(
name
);
return
var
==
nullptr
?
nullptr
:
var
->
GetMutable
<
T
>
();
}
const
Variable
*
LegacyInputVar
(
const
std
::
string
&
name
)
const
;
Variable
*
LegacyOutputVar
(
const
std
::
string
&
name
)
const
;
template
<
typename
T
>
const
std
::
vector
<
const
T
*>
MultiInput
(
const
std
::
string
&
name
)
const
{
auto
it
=
ctx_
.
inputs
.
find
(
name
);
...
...
@@ -338,32 +299,6 @@ class ExecutionContext {
return
res
;
}
template
<
typename
T
>
const
std
::
vector
<
const
T
*>
LegacyMultiInput
(
const
std
::
string
&
name
)
const
{
auto
names
=
op_
.
Inputs
(
name
);
std
::
vector
<
const
T
*>
res
;
res
.
reserve
(
names
.
size
());
std
::
transform
(
names
.
begin
(),
names
.
end
(),
std
::
back_inserter
(
res
),
[
&
](
const
std
::
string
&
sub_name
)
->
const
T
*
{
auto
var
=
scope_
.
FindVar
(
sub_name
);
return
var
==
nullptr
?
nullptr
:
&
var
->
Get
<
T
>
();
});
return
res
;
}
template
<
typename
T
>
std
::
vector
<
T
*>
LegacyMultiOutput
(
const
std
::
string
&
name
)
const
{
auto
names
=
op_
.
Outputs
(
name
);
std
::
vector
<
T
*>
res
;
res
.
reserve
(
names
.
size
());
std
::
transform
(
names
.
begin
(),
names
.
end
(),
std
::
back_inserter
(
res
),
[
&
](
const
std
::
string
&
sub_name
)
->
T
*
{
auto
var
=
scope_
.
FindVar
(
sub_name
);
return
var
==
nullptr
?
nullptr
:
var
->
GetMutable
<
T
>
();
});
return
res
;
}
platform
::
Place
GetPlace
()
const
{
return
device_context_
.
GetPlace
();
}
template
<
typename
DeviceContextType
>
...
...
@@ -433,24 +368,13 @@ class ExecutionContext {
template
<>
const
Tensor
*
ExecutionContext
::
Input
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
const
Tensor
*
ExecutionContext
::
LegacyInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
const
std
::
vector
<
const
Tensor
*>
ExecutionContext
::
MultiInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
const
std
::
vector
<
const
Tensor
*>
ExecutionContext
::
LegacyMultiInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
Tensor
*
ExecutionContext
::
Output
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
Tensor
*
ExecutionContext
::
LegacyOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
std
::
vector
<
Tensor
*>
ExecutionContext
::
MultiOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
...
...
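Note on the operator.h/operator.cc hunks above: the Legacy* accessors are removed from ExecutionContext, leaving the runtime-context based Input/Output/MultiInput/MultiOutput accessors. A minimal, hypothetical kernel body written against the surviving interface might look like the sketch below (the kernel name and slot names are invented; only the ExecutionContext calls come from this diff):

    #include "paddle/fluid/framework/op_registry.h"

    namespace paddle {
    namespace operators {

    template <typename DeviceContext, typename T>
    class ExampleKernel : public framework::OpKernel<T> {
     public:
      void Compute(const framework::ExecutionContext& ctx) const override {
        // Single input/output slots resolved through the kept accessors.
        auto* x = ctx.Input<framework::LoDTensor>("X");
        auto* out = ctx.Output<framework::LoDTensor>("Out");
        // Duplicable slot resolved through MultiInput (LegacyMultiInput is gone).
        auto xs = ctx.MultiInput<framework::LoDTensor>("Xs");
        out->mutable_data<T>(ctx.GetPlace());
        // ... computation elided ...
        (void)x;
        (void)xs;
      }
    };

    }  // namespace operators
    }  // namespace paddle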
paddle/fluid/inference/analysis/argument.h
...
...
@@ -23,8 +23,12 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
...
...
@@ -133,6 +137,8 @@ struct Argument {
  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
                      AnalysisConfig::Precision);
+  DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
+                      bool);

  // Memory optimized related.
  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
...
...
paddle/fluid/inference/analysis/helper.h
...
...
@@ -17,10 +17,12 @@ limitations under the License. */
#include <sys/stat.h>
#include <cstdio>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
...
...
@@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
  return "";
}

+static std::string GetTrtEngineSerializedPath(const std::string &model_root,
+                                              const std::string &engine_key) {
+  return model_root + "/trt_serialized_" + engine_key;
+}
+
+static std::string GetTrtEngineSerializedData(
+    const std::string &model_opt_cache_dir, const std::string &engine_key) {
+  std::string trt_serialized_path =
+      GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
+  if (FileExists(trt_serialized_path)) {
+    VLOG(3) << "Trt serialized file: " << trt_serialized_path
+            << "is found here";
+    std::ifstream infile(trt_serialized_path, std::ios::in);
+    std::stringstream buffer;
+    buffer << infile.rdbuf();
+    std::string trt_engine_serialized_data(buffer.str());
+    return trt_engine_serialized_data;
+  }
+  return "";
+}
+
+static void SaveTrtEngineSerializedDataToFile(
+    const std::string &trt_serialized_path,
+    const std::string &engine_serialized_data) {
+  std::ofstream outfile(trt_serialized_path);
+  outfile << engine_serialized_data;
+  outfile.close();
+}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
...
...
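The three helpers added to helper.h above form a small file cache for serialized TensorRT engines: GetTrtEngineSerializedPath derives the cache file name, GetTrtEngineSerializedData returns the cached bytes or "" on a miss, and SaveTrtEngineSerializedDataToFile stores them. A hedged sketch of the intended round trip (the directory and key below are placeholders, not values from this commit):

    #include <string>
    #include "paddle/fluid/inference/analysis/helper.h"

    // Sketch: cache a serialized engine on the first run, reuse it afterwards.
    void CacheEngine(const std::string &serialized_engine) {
      using namespace paddle::inference::analysis;     // helpers shown above
      const std::string cache_dir = "/tmp/model_opt_cache";  // hypothetical
      const std::string engine_key = "1234567890";           // hypothetical hash

      std::string cached = GetTrtEngineSerializedData(cache_dir, engine_key);
      if (cached.empty()) {  // miss: persist the freshly built engine
        SaveTrtEngineSerializedDataToFile(
            GetTrtEngineSerializedPath(cache_dir, engine_key), serialized_engine);
      }
      // A later run sees a hit and can skip rebuilding the engine.
    }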
paddle/fluid/inference/analysis/ir_pass_manager.cc
...
...
@@ -81,6 +81,9 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("model_opt_cache_dir",
                new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
+      pass->Set("use_static_engine",
+                new bool(argument->tensorrt_use_static_engine()));
    }

    pre_pass = pass_name;
...
...
paddle/fluid/inference/analysis/ir_pass_manager.h
...
...
@@ -22,7 +22,10 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
...
...
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
...
...
@@ -14,13 +14,13 @@
#include <algorithm>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/string/pretty_log.h"
...
...
@@ -33,8 +33,15 @@ using framework::ir::Node;
std::vector<std::string> ExtractParameters(
    const std::unordered_set<Node *> &nodes);

-std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
+void RenameAndGetOutputs(
+    const std::vector<framework::ir::Node *> &subgraph_nodes,
+    framework::BlockDesc *block_desc,
+    const std::set<std::string> &input_names_with_id,
+    std::set<std::string> *output_names_with_id,
+    std::set<std::string> *output_names,
+    std::unordered_map<std::string, std::string> *output_name_map);
+
+std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
    std::unique_ptr<framework::ir::Graph> graph) const {
  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
...
...
@@ -47,9 +54,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
      Get<int>("min_subgraph_size") /*min subgraph size*/);
  fuser();

+  std::vector<std::string> graph_param_names =
+      ExtractParameters(graph->Nodes());
+  // those parameter already exist in trt, and should not have another copy in
+  // fluid.
+  std::vector<std::string> repetitive_params;

  for (auto *node : graph->Nodes()) {
    if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateTensorRTOp(node, graph.get());
+      CreateTensorRTOp(node, graph.get(), graph_param_names,
+                       &repetitive_params);

      std::unordered_set<const Node *> nodes2remove(
          Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
...
...
@@ -64,12 +78,15 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
    }
  }
  framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  graph->Set(framework::ir::kRepetitiveParamAttr,
+             new std::vector<std::string>(repetitive_params));

  return graph;
}

std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
-                              const std::set<std::string> &engine_outputs) {
+                              const std::set<std::string> &engine_outputs,
+                              const std::string &predictor_id) {
  std::string engine_hash_key = "";
  for (auto name : engine_inputs) {
    engine_hash_key += name;
...
@@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
for
(
auto
name
:
engine_outputs
)
{
engine_hash_key
+=
name
;
}
engine_hash_key
+=
predictor_id
;
auto
engine_key
=
std
::
to_string
(
std
::
hash
<
std
::
string
>
()(
engine_hash_key
));
return
engine_key
;
}
void
TensorRtSubgraphPass
::
CreateTensorRTOp
(
framework
::
ir
::
Node
*
node
,
Graph
*
graph
)
const
{
void
TensorRtSubgraphPass
::
CreateTensorRTOp
(
framework
::
ir
::
Node
*
node
,
Graph
*
graph
,
const
std
::
vector
<
std
::
string
>
&
graph_params
,
std
::
vector
<
std
::
string
>
*
repetitive_params
)
const
{
auto
*
op_desc
=
node
->
Op
();
auto
&
subgraph
=
*
Agent
(
node
).
subgraph
();
PADDLE_ENFORCE
(
!
subgraph
.
empty
());
...
...
@@ -116,12 +136,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  // is unique.
  std::set<std::string> input_names;
  std::set<std::string> input_names_with_id;
+  std::vector<std::string> params;
+
+  // The node->inputs containes input tensors and parameters.
  for (auto *x : node->inputs) {
    input_names.insert(x->Name());
    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
+      params.push_back(x->Name());
+    }
  }
-  op_desc->SetInput(
-      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));

  std::set<std::string> output_names;
  std::set<std::string> output_names_with_id;
...
...
@@ -130,11 +154,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
  }

-  op_desc->SetOutput(
-      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-  op_desc->SetType("tensorrt_engine");

  std::unordered_map<std::string, std::string> output_name_map;
+  auto &subgraph_nodes = *Agent(node).subgraph();

  // The following procedure is used to rename all the intermediate
  // variables and the output variables of the subgraph.
...
...
@@ -148,61 +169,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  // input of a OP, but also the output of a Op, there will be problems.
  // So we have to rename the variable in the subgraph to make sure
  // it is either an OP's input or an OP's output.
-  auto &subgraph_nodes = *Agent(node).subgraph();
-  for (size_t index = 0; index < block_desc.OpSize(); ++index) {
-    framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
-    auto correspond_node = subgraph_nodes[index];
-    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
-
-    std::unordered_map<std::string, size_t> var2id;
-    for (auto *in_var : correspond_node->inputs) {
-      var2id[in_var->Name()] = in_var->id();
-    }
-    // rename for the input variables of op inside subgraph
-    for (int i = 0; i < op->inputs_size(); i++) {
-      // one input
-      auto *in_var = op->mutable_inputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < in_var->arguments_size(); k++) {
-        // all the arguments
-        std::string arg_value = in_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (input_names_with_id.count(arg_value_with_id)) {
-          replaced_names.push_back(arg_value);
-        } else {
-          replaced_names.push_back(arg_value_with_id);
-        }
-      }
-      in_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        in_var->add_arguments(replaced_names[k]);
-      }
-    }
-    var2id.clear();
-    for (auto out_var : correspond_node->outputs) {
-      var2id[out_var->Name()] = out_var->id();
-    }
-
-    // rename for the output variables of op inside subgraph
-    for (int i = 0; i < op->outputs_size(); i++) {
-      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < out_var->arguments_size(); k++) {
-        std::string arg_value = out_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (output_names_with_id.count(arg_value_with_id)) {
-          output_name_map[arg_value] = arg_value_with_id;
-        }
-        replaced_names.push_back(arg_value_with_id);
-      }
-      out_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        out_var->add_arguments(replaced_names[k]);
-      }
-    }
-  }
+  RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
+                      &output_names_with_id, &output_names, &output_name_map);

  // When tensorrt engine runs at the end of the operation,
  // output_mapping help us copy the data from the renamed ITensor
...
...
@@ -212,6 +180,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
    PADDLE_ENFORCE(output_name_map.count(name) != 0);
    output_mapping.push_back(output_name_map[name]);
  }
+  PADDLE_ENFORCE(!output_mapping.empty());

  auto *vars = block_desc.Proto()->mutable_vars();
  for (framework::ir::Node *node : graph->Nodes()) {
...
...
@@ -222,26 +191,83 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                 "the block has no var-desc");
-  PADDLE_ENFORCE(!output_mapping.empty());

+  // Set attrs
+  op_desc->SetType("tensorrt_engine");
+  op_desc->SetInput(
+      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
+  op_desc->SetOutput(
+      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
  op_desc->SetBlockAttr("sub_block", new_block);
  SetAttr(op_desc->Proto(), "subgraph",
          block_desc.Proto()->SerializeAsString());
-  // Set attrs
  SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
  SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
  SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+  SetAttr(op_desc->Proto(), "parameters", params);

  auto enable_int8 = Get<bool>("enable_int8");
-  auto engine_key =
-      GenerateEngineKey(input_names_with_id, output_names_with_id);
+  auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
+                                      std::to_string(0));

  // Get "" when there is no cached calibration table data.
  std::string calibration_data = GetTrtCalibTableData(
      Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
  SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
  SetAttr(op_desc->Proto(), "engine_key", engine_key);
+  SetAttr(op_desc->Proto(), "engine_serialized_data", std::string(""));
+
+  std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
+  if (enable_int8 && calibration_data.size() != 0) {
+    calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
+  }
+
+  bool use_static_engine = Get<bool>("use_static_engine");
+  // When in int8 mode and calibration_mode, the program just produce the
+  // calibration table data.
+  bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
+  if (!calibration_mode && use_static_engine) {
+    std::copy(params.begin(), params.end(),
+              std::back_inserter(*repetitive_params));
+    std::string trt_engine_serialized_data = GetTrtEngineSerializedData(
+        Get<std::string>("model_opt_cache_dir"), engine_key);
+
+    if (trt_engine_serialized_data.empty()) {
+      LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
+                   "kernel etc). This process may cost a lot of time.";
+      std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
+          new tensorrt::TensorRTEngine(
+              Get<int>("max_batch_size"), Get<int>("workspace_size"),
+              enable_int8, calibrator.get(), Get<int>("gpu_device_id")));
+      auto *scope = param_scope();
+      framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
+      std::unordered_set<std::string> param_set(params.begin(), params.end());
+      inference::Singleton<inference::tensorrt::OpConverter>::Global()
+          .ConvertBlockToTRTEngine(
+              &block_desc_temp, *scope,
+              std::vector<std::string>(input_names.begin(), input_names.end()),
+              param_set, output_mapping, trt_engine.get());
+      nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
+      trt_engine_serialized_data =
+          std::string((const char *)serialized_engine_data->data(),
+                      serialized_engine_data->size());
+      SaveTrtEngineSerializedDataToFile(
+          GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
+                                     engine_key),
+          trt_engine_serialized_data);
+    } else {
+      LOG(INFO) << "Load TRT Optimized Info from "
+                << GetTrtEngineSerializedPath(
+                       Get<std::string>("model_opt_cache_dir"), engine_key);
+    }
+
+    SetAttr(op_desc->Proto(), "engine_serialized_data",
+            trt_engine_serialized_data);
+  }
}
std::vector<std::string> ExtractParameters(
...
...
@@ -253,7 +279,7 @@ std::vector<std::string> ExtractParameters(
  for (const auto &node : nodes) {
    if (!node->IsOp()) continue;
    std::string op_type = node->Op()->Type();
-    if (op_type == "feed") {
+    if (op_type == "feed" || op_type == "fetch") {
      std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
      std::copy(output_names.begin(), output_names.end(),
                std::back_inserter(feed_outputs));
...
...
@@ -272,6 +298,99 @@ std::vector<std::string> ExtractParameters(
  return parameters;
}

+void RenameAndGetOutputs(
+    const std::vector<framework::ir::Node *> &subgraph_nodes,
+    framework::BlockDesc *block_desc,
+    const std::set<std::string> &input_names_with_id,
+    std::set<std::string> *output_names_with_id,
+    std::set<std::string> *output_names,
+    std::unordered_map<std::string, std::string> *output_name_map) {
+  //// In the normal case, the paddle-trt exists bug when runing the googlenet.
+  // When there are more than two convolutions of 1 * 1 with the same input, the
+  // paddle-tensorrt will do the merging optimization, which fuse those conv
+  // into one conv, and then trigger bug. So, We should use strategy to avoid
+  // this optimization for the time being. This bug will be fixed in the future.
+  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
+      same_hierarchy_conv2d_num_map;
+
+  for (size_t index = 0; index < block_desc->OpSize(); ++index) {
+    framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
+    framework::OpDesc op_desc(*op, nullptr);
+    auto correspond_node = subgraph_nodes[index];
+    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
+
+    std::unordered_map<std::string, size_t> var2id;
+    std::unordered_map<std::string, framework::ir::Node *> in_vars;
+    for (auto *in_var : correspond_node->inputs) {
+      var2id[in_var->Name()] = in_var->id();
+      in_vars[in_var->Name()] = in_var;
+    }
+    // rename for the input variables of op inside subgraph
+    for (int i = 0; i < op->inputs_size(); i++) {
+      // one input
+      auto *in_var = op->mutable_inputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < in_var->arguments_size(); k++) {
+        // all the arguments
+        std::string arg_value = in_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (input_names_with_id.count(arg_value_with_id)) {
+          replaced_names.push_back(arg_value);
+        } else {
+          replaced_names.push_back(arg_value_with_id);
+        }
+      }
+      in_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        in_var->add_arguments(replaced_names[k]);
+      }
+    }
+    var2id.clear();
+    for (auto out_var : correspond_node->outputs) {
+      var2id[out_var->Name()] = out_var->id();
+    }
+
+    if (op_desc.Type() == "conv2d") {
+      auto input_var_name = op_desc.Input("Input").front();
+      auto filter_var_name = op_desc.Input("Filter").front();
+      auto out_var_name = op_desc.Output("Output").front();
+      auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
+      const std::vector<int> strides =
+          boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+      const std::vector<int> paddings =
+          boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+      if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
+        (*output_names_with_id)
+            .insert(out_var_name + std::to_string(var2id[out_var_name]));
+        (*output_names).insert(out_var_name);
+      } else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
+                 strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
+                 paddings[1] == 0) {
+        same_hierarchy_conv2d_num_map[input_var_name] += 1;
+      }
+    }
+
+    // rename for the output variables of op inside subgraph
+    for (int i = 0; i < op->outputs_size(); i++) {
+      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < out_var->arguments_size(); k++) {
+        std::string arg_value = out_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (output_names_with_id->count(arg_value_with_id)) {
+          (*output_name_map)[arg_value] = arg_value_with_id;
+        }
+        replaced_names.push_back(arg_value_with_id);
+      }
+      out_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        out_var->add_arguments(replaced_names[k]);
+      }
+    }
+  }
+}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
...
...
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
...
...
@@ -13,7 +13,12 @@
// limitations under the License.
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
...
...
@@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
      std::unique_ptr<framework::ir::Graph> graph) const override;

 private:
-  void CreateTensorRTOp(framework::ir::Node *x,
-                        framework::ir::Graph *graph) const;
+  void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
+                        const std::vector<std::string> &graph_params,
+                        std::vector<std::string> *repetitive_params) const;
  void CleanIntermediateOutputs(framework::ir::Node *node);
};
...
...
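For reference, GenerateEngineKey in the pass above now concatenates the subgraph's input names, output names and a predictor id, then hashes the result with std::hash<std::string>. A standalone, self-contained sketch of that keying scheme (the tensor names below are invented):

    #include <functional>
    #include <iostream>
    #include <set>
    #include <string>

    std::string MakeEngineKey(const std::set<std::string> &inputs,
                              const std::set<std::string> &outputs,
                              const std::string &predictor_id) {
      std::string hash_key;
      for (const auto &name : inputs) hash_key += name;   // input names
      for (const auto &name : outputs) hash_key += name;  // output names
      hash_key += predictor_id;                           // added by this commit
      return std::to_string(std::hash<std::string>()(hash_key));
    }

    int main() {
      std::set<std::string> ins{"image0", "im_info1"};    // hypothetical names
      std::set<std::string> outs{"nms_out2"};             // hypothetical name
      std::cout << MakeEngineKey(ins, outs, "0") << std::endl;
      return 0;
    }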
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
...
...
@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
  // The parameters are on the cpu, therefore, synchronization is not necessary.
  if (!argument->use_gpu()) return;

+  auto &graph = argument->main_graph();
+  std::vector<std::string> repetitive_params;
+
+  if (graph.Has(framework::ir::kRepetitiveParamAttr))
+    repetitive_params = graph.Get<std::vector<std::string>>(
+        framework::ir::kRepetitiveParamAttr);
+
  LOG(INFO) << "Sync params from CPU to GPU";

  PADDLE_ENFORCE(argument->gpu_device_id_valid());
...
...
@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
  // Because there exists the case that new parameter variables are not added to
  // the program in the analysis pass.
  for (auto &var_name : all_vars) {
+    if (std::count(repetitive_params.begin(), repetitive_params.end(),
+                   var_name)) {
+      continue;
+    }
    auto *var = scope->FindLocalVar(var_name);
    PADDLE_ENFORCE(var != nullptr);
    if (var->IsType<framework::LoDTensor>() ||
...
...
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
...
...
@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"
...
...
paddle/fluid/inference/api/analysis_config.cc
...
...
@@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  CP_MEMBER(tensorrt_max_batchsize_);
  CP_MEMBER(tensorrt_min_subgraph_size_);
  CP_MEMBER(tensorrt_precision_mode_);
+  CP_MEMBER(trt_use_static_engine_);
  // MKLDNN related.
  CP_MEMBER(use_mkldnn_);
  CP_MEMBER(mkldnn_enabled_op_types_);
...
...
@@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() {
void AnalysisConfig::EnableTensorRtEngine(
    int workspace_size, int max_batch_size, int min_subgraph_size,
-    AnalysisConfig::Precision precision_mode) {
+    AnalysisConfig::Precision precision_mode, bool use_static) {
#ifdef PADDLE_WITH_CUDA
  if (!use_gpu()) {
    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
...
...
@@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine(
  tensorrt_max_batchsize_ = max_batch_size;
  tensorrt_min_subgraph_size_ = min_subgraph_size;
  tensorrt_precision_mode_ = precision_mode;
+  trt_use_static_engine_ = use_static;

  Update();
#else
...
...
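The EnableTensorRtEngine change above adds a trailing use_static flag that turns on the serialized-engine cache. A hedged sketch of calling the configuration API after this commit (the model path, pool size and batch sizes are placeholders; Precision::kFloat32 is assumed to be the default precision enum value):

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    paddle::AnalysisConfig MakeTrtConfig() {
      paddle::AnalysisConfig config;
      config.SetModel("/path/to/model_dir");            // placeholder path
      config.EnableUseGpu(100 /*pool MB*/, 0 /*gpu id*/);
      // workspace_size, max_batch_size, min_subgraph_size, precision, use_static
      config.EnableTensorRtEngine(1 << 20, 1, 3,
                                  paddle::AnalysisConfig::Precision::kFloat32,
                                  true /*use_static: reuse serialized engines*/);
      return config;
    }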
paddle/fluid/inference/api/analysis_predictor.cc
...
...
@@ -183,6 +183,9 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) {
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                            std::vector<PaddleTensor> *output_data,
                            int batch_size) {
+  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
+    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+  }
  VLOG(3) << "Predictor::predict";
  inference::Timer timer;
  timer.tic();
...
...
@@ -362,6 +365,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
    argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
    argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
+    argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
  }

  if (config_.use_mkldnn_) {
...
...
@@ -435,12 +439,14 @@ void AnalysisPredictor::PrepareFeedFetch() {
      }
      feeds_[idx] = op;
      feed_names_[op->Output("Out")[0]] = idx;
+      idx2feeds_[idx] = op->Output("Out")[0];
    } else if (op->Type() == "fetch") {
      int idx = boost::get<int>(op->GetAttr("col"));
      if (fetches_.size() <= static_cast<size_t>(idx)) {
        fetches_.resize(idx + 1);
      }
      fetches_[idx] = op;
+      idx2fetches_[idx] = op->Input("X")[0];
    }
  }
}
...
...
@@ -453,6 +459,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
  var->GetMutable<framework::FeedFetchList>();
}

std::vector<std::string> AnalysisPredictor::GetInputNames() {
  std::vector<std::string> input_names;
  for (auto &item : idx2feeds_) {
    input_names.push_back(item.second);
  }
  return input_names;
}

std::vector<std::string> AnalysisPredictor::GetOutputNames() {
  std::vector<std::string> output_names;
  for (auto &item : idx2fetches_) {
    output_names.push_back(item.second);
  }
  return output_names;
}

std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
    const std::string &name) {
  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
...
...
@@ -460,6 +482,13 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
  res->input_or_output_ = true;
  res->SetName(name);
  if (platform::is_cpu_place(place_)) {
    res->SetPlace(PaddlePlace::kCPU);
  } else {
    auto gpu_place = boost::get<platform::CUDAPlace>(place_);
    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
  }
  return res;
}
...
...
@@ -470,6 +499,12 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
  res->input_or_output_ = false;
  res->SetName(name);
  if (platform::is_cpu_place(place_)) {
    res->SetPlace(PaddlePlace::kCPU);
  } else {
    auto gpu_place = boost::get<platform::CUDAPlace>(place_);
    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
  }
  return res;
}
...
...
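A short, hedged sketch of how the new name accessors and tensor getters are meant to be used together (predictor creation is assumed to follow the usual CreatePaddlePredictor path, which is not shown in this diff):

  auto predictor = paddle::CreatePaddlePredictor(config);  // assumed factory call
  std::vector<std::string> in_names = predictor->GetInputNames();
  std::vector<std::string> out_names = predictor->GetOutputNames();
  for (const auto &n : in_names) LOG(INFO) << "input: " << n;
  for (const auto &n : out_names) LOG(INFO) << "output: " << n;
  auto input_t = predictor->GetInputTensor(in_names[0]);  // ZeroCopyTensor bound to place_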
paddle/fluid/inference/api/analysis_predictor.h
...
...
@@ -15,12 +15,14 @@
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
#ifdef PADDLE_WITH_TESTING
...
...
@@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor {
           std::vector<PaddleTensor> *output_data,
           int batch_size = -1) override;

  std::vector<std::string> GetInputNames();
  std::vector<std::string> GetOutputNames();

  std::unique_ptr<ZeroCopyTensor> GetInputTensor(
      const std::string &name) override;
  std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
...
...
@@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor {
  std::shared_ptr<framework::ProgramDesc> inference_program_;
  std::vector<framework::OpDesc *> feeds_;
  std::map<std::string, size_t> feed_names_;
  // Sorted according to the idx.
  std::map<size_t, std::string> idx2feeds_;
  std::vector<framework::OpDesc *> fetches_;
  std::map<size_t, std::string> idx2fetches_;
  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
  // concurrency problems, wrong results and memory leak, so cache them.
  std::vector<framework::LoDTensor> feed_tensors_;
...
...
paddle/fluid/inference/api/api_impl.cc
...
...
@@ -131,6 +131,9 @@ NativePaddlePredictor::~NativePaddlePredictor() {
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                std::vector<PaddleTensor> *output_data,
                                int batch_size) {
  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
  }
  VLOG(3) << "Predictor::predict";
  Timer timer;
  timer.tic();
...
...
paddle/fluid/inference/api/details/zero_copy_tensor.cc
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
...
...
@@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
  return res;
}

template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
  EAGER_GET_TENSOR;
  PADDLE_ENFORCE_GE(
      tensor->numel(), 0,
      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
      "function before copy data from cpu.");
  size_t ele_size = tensor->numel() * sizeof(T);

  if (place_ == PaddlePlace::kCPU) {
    auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
    std::memcpy(static_cast<void *>(t_data), data, ele_size);
  } else {
#ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    platform::CUDAPlace gpu_place(device_);
    auto *t_data = tensor->mutable_data<T>(gpu_place);
    auto *dev_ctx =
        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));

    memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
                 data, ele_size, dev_ctx->stream());
#else
    PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
  }
}

template <typename T>
void ZeroCopyTensor::copy_to_cpu(T *data) {
  EAGER_GET_TENSOR;
  auto ele_num = tensor->numel();
  auto *t_data = tensor->data<T>();
  auto t_place = tensor->place();

  if (platform::is_cpu_place(t_place)) {
    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
  } else {
#ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto gpu_place = boost::get<platform::CUDAPlace>(t_place);
    auto *dev_ctx =
        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
                 t_data, ele_num * sizeof(T), dev_ctx->stream());
#else
    PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
  }
}

template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);

template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
                                            int *size) const;
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
...
...
@@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const {
  return tensor;
}

- std::vector<int64_t> ZeroCopyTensor::shape() const {
std::vector<int> ZeroCopyTensor::shape() const {
  EAGER_GET_TENSOR;
  PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
- return framework::vectorize(tensor->dims());
  return framework::vectorize2int(tensor->dims());
}

void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
...
...
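Putting the copy helpers together, a typical zero-copy round trip looks roughly like the sketch below; the 1x3x224x224 shape and the ZeroCopyRun() call are illustrative assumptions, not part of this diff:

  std::vector<float> input(1 * 3 * 224 * 224, 0.f);
  auto in_t = predictor->GetInputTensor(predictor->GetInputNames()[0]);
  in_t->Reshape({1, 3, 224, 224});    // required before copy_from_cpu
  in_t->copy_from_cpu(input.data());  // memcpy on CPU, async cudaMemcpy on GPU

  predictor->ZeroCopyRun();           // assumed execution entry point

  auto out_t = predictor->GetOutputTensor(predictor->GetOutputNames()[0]);
  std::vector<int> out_shape = out_t->shape();  // shape() now returns std::vector<int>
  int numel = 1;
  for (int d : out_shape) numel *= d;
  std::vector<float> output(numel);
  out_t->copy_to_cpu(output.data());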
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
...
...
@@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const { return nullptr; }

- std::vector<int64_t> ZeroCopyTensor::shape() const { return {}; }
std::vector<int> ZeroCopyTensor::shape() const { return {}; }

void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
...
...
paddle/fluid/inference/api/helper.h
...
...
@@ -50,6 +50,11 @@ class Timer {
  }
};

static int GetUniqueId() {
  static int id = 0;
  return id++;
}

static void split(const std::string &str, char sep,
                  std::vector<std::string> *pieces) {
  pieces->clear();
...
...
paddle/fluid/inference/api/paddle_analysis_config.h
...
...
@@ -135,7 +135,8 @@ struct AnalysisConfig {
   */
  void EnableTensorRtEngine(int workspace_size = 1 << 20,
                            int max_batch_size = 1, int min_subgraph_size = 3,
-                           Precision precision = Precision::kFloat32);
                            Precision precision = Precision::kFloat32,
                            bool use_static = true);
  /** A boolean state telling whether the TensorRT engine is used.
   */
  bool tensorrt_engine_enabled() const { return use_tensorrt_; }
...
...
@@ -233,6 +234,7 @@ struct AnalysisConfig {
  // subgraph, 3 as default value.
  int tensorrt_min_subgraph_size_{3};
  Precision tensorrt_precision_mode_;
  bool trt_use_static_engine_;
  // memory reuse related.
  bool enable_memory_optim_{false};
...
...
paddle/fluid/inference/api/paddle_api.h
...
...
@@ -160,11 +160,21 @@ class ZeroCopyTensor {
  template <typename T>
  T* data(PaddlePlace* place, int* size) const;
- std::vector<int64_t> shape() const;

  template <typename T>
  void copy_from_cpu(const T* data);

  template <typename T>
  void copy_to_cpu(T* data);

  std::vector<int> shape() const;

  void SetLoD(const std::vector<std::vector<size_t>>& x);
  std::vector<std::vector<size_t>> lod() const;
  const std::string& name() const { return name_; }

  void SetPlace(PaddlePlace place, int device = -1) {
    place_ = place;
    device_ = device;
  }

 protected:
  explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
...
...
@@ -179,6 +189,8 @@ class ZeroCopyTensor {
  // The corresponding tensor pointer inside Paddle workspace is cached for
  // performance.
  mutable void* tensor_{nullptr};
  PaddlePlace place_;
  int device_;
};
/** A simple Inference API for Paddle.
...
...
@@ -200,6 +212,14 @@ class PaddlePredictor {
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;

  /** \brief Get input names of the model
   */
  virtual std::vector<std::string> GetInputNames() { return {}; }

  /** \brief Get output names of the model
   */
  virtual std::vector<std::string> GetOutputNames() { return {}; }
/** \brief Get a mutable tensor directly.
*
* NOTE Only works in AnalysisPredictor.
...
...
paddle/fluid/inference/engine.h
...
...
@@ -49,11 +49,6 @@ class EngineBase {
  // Execute the engine, that will run the inference network.
  virtual void Execute(int batch_size) = 0;

- // Return the IO buffer that allocated in engine. One can read/write directly
- // on the buffer. If the buffer's buffer is nullptr, one can also allocate
- // memory and maintain it outside the engine.
- virtual Buffer &buffer(const std::string &name) = 0;

  virtual ~EngineBase() {}
};  // class EngineBase
...
...
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
...
...
@@ -18,21 +18,6 @@ namespace paddle {
namespace inference {
namespace tensorrt {

- bool to_skip_merging_optimize(TensorRTEngine* engine,
-                               const std::vector<int>& filters,
-                               const std::vector<int>& strides,
-                               const std::vector<int>& paddings,
-                               std::string input_name) {
-   if (engine->itensor_quote_num[input_name] > 0) {
-     return true;
-   }
-   if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
-       strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
-     engine->itensor_quote_num[input_name] += 1;
-   return false;
- }

template <typename RegistFunc, typename SetDilationFunc>
void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode,
...
...
@@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
  weight_tensor->Resize(Y_t->dims());
  TensorCopySync((*Y_t), cpu_place, weight_tensor.get());

- auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
  auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);

  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
  const int n_output = weight_tensor->dims()[0];
...
...
@@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
  layer->getOutput(0)->setName(output_name.c_str());
  engine->SetITensor(output_name, layer->getOutput(0));

- if (test_mode ||
-     to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
-                              op_desc.Input("Input").front())) {
  if (test_mode) {
    engine->DeclareOutput(output_name);
  }
}
...
...
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
...
...
@@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter {
    if (CheckDims(dims_x, dims_y)) {
      // The two input tensor should have the same dims
      VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
      nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
          engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
          *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
...
...
@@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
"ElementWisePluginLayer"
              ;
      plugin::ElementWisePlugin* plugin =
-         new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
          new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
      plugin->AddInput(X);
      plugin->AddInput(Y);
      nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
...
paddle/fluid/inference/tensorrt/convert/fc_op.cc
...
...
@@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter {
                          Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                  static_cast<void*>(weight_data),
-                                 Y_t->memory_size() / sizeof(float)};
                                  static_cast<size_t>(Y_t->numel())};
    TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
                                      static_cast<void*>(tmp->data<float>()),
-                                     Y_t->memory_size() / sizeof(float));
                                      static_cast<size_t>(Y_t->numel()));

    weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
    tmp_weight.dims = weight.dims;
...
...
paddle/fluid/inference/tensorrt/convert/op_converter.h
...
...
@@ -16,9 +16,12 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
...
...
@@ -26,6 +29,37 @@ namespace paddle {
namespace inference {
namespace tensorrt {

using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;

namespace {  // NOLINT

TRT_DT FluidDataType2TRT(FluidDT type) {
  switch (type) {
    case FluidDT::VarType_Type_FP32:
      return TRT_DT::kFLOAT;
    case FluidDT::VarType_Type_INT32:
      return TRT_DT::kINT32;
    default:
      return TRT_DT::kINT32;
  }
  PADDLE_THROW("unkown type");
  return TRT_DT::kINT32;
}

nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
  PADDLE_ENFORCE_GT(shape.size(), 1UL,
                    "TensorRT' tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT' tensor input requires at most 4 dimensions");
  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
  if (shape.size() == 4UL)
    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
  return nvinfer1::DimsCHW(shape[1], 1, 1);
}

}  // namespace // NOLINT
/*
* Convert Op from Fluid to TensorRT Engine.
*/
...
...
@@ -110,6 +144,34 @@ class OpConverter {
    }
  }

  // The scope here should be inited with the parameter vars.
  void ConvertBlockToTRTEngine(
      framework::BlockDesc* block_desc, const framework::Scope& scope,
      const std::vector<std::string>& inputs,
      const std::unordered_set<std::string>& parameters,
      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
    engine->InitNetwork();
    for (auto& input : inputs) {
      if (parameters.count(input)) continue;
      auto* var = block_desc->FindVar(input);
      PADDLE_ENFORCE(var, "no variable called %s", input);
      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                        "TensorRT engine only takes LoDTensor as input");
      auto var_shape = var->GetShape();
      engine->DeclareInput(
          input, FluidDataType2TRT(
                     var->Proto()->type().lod_tensor().tensor().data_type()),
          Vec2TRT_Dims(var_shape));
    }
    framework::proto::BlockDesc* block_proto = block_desc->Proto();
    ConvertBlock(*block_proto, parameters, scope, engine);
    for (auto& output : outputs) {
      engine->DeclareOutput(output);
    }
    engine->FreezeNetwork();
  }

  void SetEngine(TensorRTEngine* engine) { engine_ = engine; }

  virtual ~OpConverter() {}
...
...
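ConvertBlockToTRTEngine above is the entry point a subgraph pass would call for each fused block; a hedged call-site sketch follows (the variable names and the Singleton access pattern are assumptions based on how converters are used elsewhere in this codebase):

  // Illustrative only: convert one fused block into a TensorRT network.
  std::unordered_set<std::string> param_set(params.begin(), params.end());
  inference::Singleton<inference::tensorrt::OpConverter>::Global()
      .ConvertBlockToTRTEngine(&block_desc, *scope,
                               std::vector<std::string>{"subgraph_input"},   // hypothetical
                               param_set,
                               std::vector<std::string>{"subgraph_output"},  // hypothetical
                               trt_engine);
  // After the call the engine's network has been built and frozen.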
paddle/fluid/inference/tensorrt/convert/prelu_op.cc
...
...
@@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter {
    PADDLE_ENFORCE_NOT_NULL(alpha_var);
    auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();

-   platform::CUDAPlace place;
-   std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
    platform::CPUPlace cpu_place;
    std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
        new framework::LoDTensor());
-   alpha_tensor_device->Resize(alpha_tensor->dims());
-   TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
-   float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
    alpha_tensor_temp->Resize(alpha_tensor->dims());
    TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
    float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);

-   // Transform alpha to TensorRTEngine::Weight
-   TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
-                                   static_cast<void*>(alpha_data),
-                                   alpha_tensor_device->numel());
-   plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
    plugin::PReluPlugin* plugin =
        new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);
    // keep alpha tensor to avoid release it's memory
    engine_->weight_map[op_desc.Input("Alpha")[0]] =
-       std::move(alpha_tensor_device);
        std::move(alpha_tensor_temp);

    std::string layer_name = "prelu (Output: ";
    auto output_name = op_desc.Output("Out")[0];
...
...
paddle/fluid/inference/tensorrt/convert/ut_helper.h
...
...
@@ -19,7 +19,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
...
...
@@ -79,7 +81,8 @@ class TRTConvertValidation {
        if_add_batch_(if_add_batch),
        max_batch_size_(max_batch_size) {
    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-   engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
    engine_.reset(
        new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
    engine_->InitNetwork();
  }
...
...
@@ -114,13 +117,12 @@ class TRTConvertValidation {
  }

  void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
-   platform::CUDAPlace place;
-   platform::CUDADeviceContext ctx(place);
    platform::CUDADeviceContext ctx(place_);

    auto* x = scope_.Var(name);
    auto* x_tensor = x->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(dim_vec));
-   RandomizeTensor(x_tensor, place, ctx);
    RandomizeTensor(x_tensor, place_, ctx);
  }

  // Declare a variable in a fluid Scope.
  void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
...
...
@@ -146,19 +148,6 @@ class TRTConvertValidation {
    // Declare outputs.
    op_desc_.reset(new framework::OpDesc(desc, nullptr));
-   // Set Inputs.
-   for (const auto& input : op_desc_->InputArgumentNames()) {
-     if (parameters_.count(input)) continue;
-     auto* var = scope_.FindVar(input);
-     PADDLE_ENFORCE(var);
-     auto tensor = var->GetMutable<framework::LoDTensor>();
-     engine_->SetInputFromGPU(
-         input, static_cast<void*>(tensor->data<void>()),
-         sizeof(float) *
-             analysis::AccuDims(tensor->dims(), tensor->dims().size()));
-   }
  }
// We use the set 'neglected_output' here, because some Ops like batch norm,
...
...
@@ -168,43 +157,71 @@ class TRTConvertValidation {
          std::unordered_set<std::string> neglected_output = {}) {
    // Execute Fluid Op
    PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
-   platform::CUDAPlace place;
-   platform::CUDADeviceContext ctx(place);
-   op_->Run(scope_, place);
-   // Execute TRT.
-   engine_->Execute(batch_size);
-   cudaStreamSynchronize(engine_->stream());
    platform::CUDADeviceContext ctx(place_);
    op_->Run(scope_, place_);

    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
    const size_t output_space_size = 3000;
    std::vector<std::string> input_output_names;

    // Note: we need filter the parameter
    for (const auto& input : op_desc_->InputArgumentNames()) {
      if (parameters_.count(input)) continue;
      input_output_names.push_back(input);
    }

    // Collect the fluid outputs.
    std::vector<std::vector<float>> fluid_outs;
    for (const auto& output : op_desc_->OutputArgumentNames()) {
      if (neglected_output.count(output)) continue;
      input_output_names.push_back(output);
      std::vector<float> fluid_out;
-     std::vector<float> trt_out(output_space_size);
-     engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
-     cudaStreamSynchronize(engine_->stream());
      auto* var = scope_.FindVar(output);
-     auto tensor = var->GetMutable<framework::LoDTensor>();
      auto* tensor = var->GetMutable<framework::LoDTensor>();
      framework::TensorToVector(*tensor, ctx, &fluid_out);
      fluid_outs.push_back(fluid_out);
    }

    // Bind input and output for TRT.
    const int num_bindings = input_output_names.size();
    std::vector<void*> buffers(num_bindings);

    for (const std::string& name : input_output_names) {
      auto* var = scope_.FindVar(name);
      auto* tensor = var->GetMutable<framework::LoDTensor>();
      const int bind_index = engine_->engine()->getBindingIndex(name.c_str());
      buffers[bind_index] =
          static_cast<void*>(tensor->mutable_data<float>(place_));
    }

    // Execute TRT.
    engine_->Execute(batch_size, &buffers, stream_);

-   size_t fluid_out_size = fluid_out.size();
    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
    int index = 0;
    for (const auto& output : op_desc_->OutputArgumentNames()) {
      if (neglected_output.count(output)) continue;
      std::vector<float> trt_out;
      auto* var = scope_.FindVar(output);
      auto* tensor = var->GetMutable<framework::LoDTensor>();
      framework::TensorToVector(*tensor, ctx, &trt_out);

      size_t fluid_out_size = fluid_outs[index].size();
      if (if_add_batch_ == true) {
        fluid_out_size =
            batch_size * (framework::product(tensor->dims()) / max_batch_size_);
      }
-     // Compare two output
-     ASSERT_FALSE(fluid_out.empty());
      for (size_t i = 0; i < fluid_out_size; i++) {
        // Loose the threshold for CI in different machine model.
-       EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
        EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5);
      }
      index += 1;
    }
  }

  framework::Scope& scope() { return scope_; }

 private:
  platform::CUDAPlace place_;
  std::unique_ptr<TensorRTEngine> engine_;
  cudaStream_t stream_;
  std::unique_ptr<framework::OperatorBase> op_;
...
...
paddle/fluid/inference/tensorrt/engine.cc
...
...
@@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) {
  PADDLE_ENFORCE(false, "not implemented");
}

- void TensorRTEngine::Execute(int batch_size) {
void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
                             cudaStream_t stream) {
  freshDeviceId();
  batch_size_ = batch_size;
- std::vector<void *> buffers;
- for (auto &buf : buffers_) {
-   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
-   PADDLE_ENFORCE_GT(buf.max_size, 0);
-   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-   buffers.push_back(buf.buffer);
- }
- infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
- cudaStreamSynchronize(stream_);
  infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr);
  cudaStreamSynchronize(stream);
  SetRuntimeBatch(batch_size);
}

- TensorRTEngine::~TensorRTEngine() {
-   cudaStreamSynchronize(stream_);
-   // clean buffer
-   for (auto &buf : buffers_) {
-     if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
-       PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
-       buf.buffer = nullptr;
-       buf.max_size = 0;
-     }
-   }
- }
void TensorRTEngine::FreezeNetwork() {
- VLOG(3) << "TRT to freeze network";
  freshDeviceId();
  VLOG(3) << "TRT to freeze network";
  PADDLE_ENFORCE(infer_builder_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  PADDLE_ENFORCE(infer_network_ != nullptr,
...
...
@@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");

  infer_context_.reset(infer_engine_->createExecutionContext());
- // allocate GPU buffers.
- buffers_.resize(buffer_sizes_.size());
- for (auto &item : buffer_sizes_) {
-   // The output buffers are not set in the network building phrase, need to
-   // infer from the TesorRT network.
-   if (item.second == 0) {
-     auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
-     auto dims = infer_engine_->getBindingDimensions(slot_offset);
-     item.second = kDataTypeSize[static_cast<int>(
-                       infer_engine_->getBindingDataType(slot_offset))] *
-                   analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
-     PADDLE_ENFORCE_GT(item.second, 0);
-   }
-   auto &buf = buffer(item.first);
-   buf.max_size = item.second * max_batch_;
-   CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-   PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
-   buf.size = 0;
-   PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
-   buf.device = DeviceType::GPU;
- }
}

nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
...
...
@@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
  buffer_sizes_[name] = 0;
}

- void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
-   return buffer(name).buffer;
- }
-
- void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
-                                     size_t max_size) {
-   // determine data size
-   auto *output = TensorRTEngine::GetITensor(name);
-   nvinfer1::Dims dims = output->getDimensions();
-   auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
-   size_t dst_size = dim_size * runtime_batch_ *
-                     kDataTypeSize[static_cast<int>(output->getType())];
-   auto it = buffer_sizes_.find(name);
-   PADDLE_ENFORCE(it != buffer_sizes_.end());
-   PADDLE_ENFORCE_GT(it->second, 0);
-   PADDLE_ENFORCE_LE(dst_size, it->second);
-   PADDLE_ENFORCE_GE(max_size, dst_size);
-   auto &buf = buffer(name);
-   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-   PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                     cudaMemcpyDeviceToDevice, stream_),
-                     0);
- }
-
- void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
-                                     size_t max_size) {
-   // determine data size
-   auto *output = TensorRTEngine::GetITensor(name);
-   nvinfer1::Dims dims = output->getDimensions();
-   auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
-   size_t dst_size = dim_size * runtime_batch_ *
-                     kDataTypeSize[static_cast<int>(output->getType())];
-   auto it = buffer_sizes_.find(name);
-   PADDLE_ENFORCE(it != buffer_sizes_.end());
-   PADDLE_ENFORCE_GT(it->second, 0);
-   PADDLE_ENFORCE_LE(dst_size, it->second);
-   PADDLE_ENFORCE_GE(max_size, dst_size);
-   auto &buf = buffer(name);
-   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                        cudaMemcpyDeviceToHost, stream_));
- }
-
- Buffer &TensorRTEngine::buffer(const std::string &name) {
-   PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
-   auto it = buffer_sizes_.find(name);
-   PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
-                  name);
-   auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
-   return buffers_[slot_offset];
- }
-
- void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
-                                      size_t size) {
-   auto &buf = buffer(name);
-   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
-   PADDLE_ENFORCE_NOT_NULL(data);
-   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
-   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-   buf.size = size;
-   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                        cudaMemcpyHostToDevice, stream_));
- }
-
- void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
-                                      size_t size) {
-   auto &buf = buffer(name);
-   buf.size = size;
-   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
-   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
-   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                        cudaMemcpyDeviceToDevice, stream_));
- }

void TensorRTEngine::SetITensor(const std::string &name,
                                nvinfer1::ITensor *tensor) {
  PADDLE_ENFORCE(tensor != nullptr);
...
...
@@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }

- void TensorRTEngine::freshDeviceId() {
-   int count;
-   cudaGetDeviceCount(&count);
-   PADDLE_ENFORCE_LT(device_, count);
-   cudaSetDevice(device_);
- }

nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
    nvinfer1::ITensor *const *inputs, int num_inputs,
    plugin::PluginTensorRT *plugin) {
...
@@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
return
infer_network_
.
get
()
->
addPluginExt
(
inputs
,
num_inputs
,
*
plugin
);
}
void
TensorRTEngine
::
freshDeviceId
()
{
int
count
;
cudaGetDeviceCount
(
&
count
);
PADDLE_ENFORCE_LT
(
device_id_
,
count
);
cudaSetDevice
(
device_id_
);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
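With buffer bookkeeping removed from the engine, callers now own the device buffers and hand them to Execute together with a stream. A hedged sketch of the new calling convention (binding names and pointers are illustrative, and the pointers must already be device memory):

  std::vector<void *> buffers(engine->engine()->getNbBindings(), nullptr);
  int in_idx = engine->engine()->getBindingIndex("input");    // hypothetical binding name
  int out_idx = engine->engine()->getBindingIndex("output");  // hypothetical binding name
  buffers[in_idx] = input_gpu_ptr;     // caller-owned GPU memory
  buffers[out_idx] = output_gpu_ptr;

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  engine->Execute(batch_size, &buffers, stream);  // enqueues and synchronizes on `stream`
  cudaStreamDestroy(stream);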
paddle/fluid/inference/tensorrt/engine.h
...
...
@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h"
...
...
@@ -37,7 +38,9 @@ class TRTInt8Calibrator;
 * There are two alternative ways to use it, one is to build from a paddle
 * protobuf model, another way is to manully construct the network.
 */
- class TensorRTEngine : public EngineBase {
class TensorRTEngine {
  using DescType = ::paddle::framework::proto::BlockDesc;

 public:
  // Weight is model parameter.
  class Weight {
...
...
@@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase {
    nvinfer1::Weights w_;
  };

- TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
-                int device = 0, bool enable_int8 = false,
-                TRTInt8Calibrator* calibrator = nullptr,
  TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false,
                 TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
                 nvinfer1::ILogger& logger = NaiveLogger::Global())
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
-       stream_(stream),
-       device_(device),
        enable_int8_(enable_int8),
        calibrator_(calibrator),
        device_id_(device_id),
        logger_(logger) {}

- virtual ~TensorRTEngine();
  ~TensorRTEngine() {}

  // TODO(Superjomn) implement it later when graph segmentation is supported.
- void Build(const DescType& paddle_model) override;
  void Build(const DescType& paddle_model);

- void Execute(int batch_size) override;
  void Execute(int batch_size, std::vector<void*>* buffers,
               cudaStream_t stream);

  // Initialize the inference network, so that TensorRT layers can add to this
  // network.
  void InitNetwork() {
    freshDeviceId();
    infer_builder_.reset(createInferBuilder(&logger_));
    infer_network_.reset(infer_builder_->createNetwork());
  }
...
...
@@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase {
  // Check if the ITensor has been declared
  bool HasDeclared(const std::string& name);

- // GPU memory address for an ITensor with specific name. One can operate on
- // these memory directly for acceleration, for example, output the converted
- // data directly to the buffer to save data copy overhead.
- // NOTE this should be used after calling `FreezeNetwork`.
- Buffer& buffer(const std::string& name) override;
-
- cudaStream_t stream() { return stream_; }
-
- // Fill an input from CPU memory with name and size.
- void SetInputFromCPU(const std::string& name, const void* data, size_t size);
- // TODO(Superjomn) is this method necessary given that buffer(xxx) can be
- // accessed directly. Fill an input from GPU memory with name and size.
- void SetInputFromGPU(const std::string& name, const void* data, size_t size);
- // Get an output called name, the output of tensorrt is in GPU, so this method
- // Return the output's GPU memory address without copy.
- void* GetOutputInGPU(const std::string& name);
- // Copy data into dst inside the GPU device.
- void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
- // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
- // to CPU.
- void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
  // Fill an ITensor into map itensor_map_.
  void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
  // Get an ITensor called name.
  nvinfer1::ITensor* GetITensor(const std::string& name);

  nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
  nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }

  nvinfer1::IHostMemory* Serialize() {
    PADDLE_ENFORCE(infer_engine_ != nullptr,
                   "You should build engine first and then serialize");
    ihost_memory_.reset(infer_engine_->serialize());
    return ihost_memory_.get();
  }

  void Deserialize(const std::string& engine_serialized_data) {
    freshDeviceId();
    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
    infer_engine_.reset(runtime->deserializeCudaEngine(
        engine_serialized_data.c_str(), engine_serialized_data.size(),
        &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
    PADDLE_ENFORCE(infer_engine_ != nullptr,
                   "build cuda engine failed when deserialize engine info.!");
    infer_context_.reset(infer_engine_->createExecutionContext());
  }

  void SetRuntimeBatch(size_t batch_size);
  int GetRuntimeBatch();
- int GetDevice() { return device_; }
  int GetDeviceId() { return device_id_; }

  nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                    int num_inputs, plugin::PluginTensorRT*);
...
...
@@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase {
  std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
      weight_map;

- // TODO(NHZLX)
- // In the normal case, the paddle-trt exists bug when runing the googlenet.
- // When there are more than two convolutions of 1 * 1 with the same input, the
- // paddle-tensorrt will do the merging optimization, which fuse those conv
- // into one conv, and then trigger bug. So, We should use strategy to avoid
- // this
- // optimization for the time being. This bug will be fixed in the future.
- std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
-     itensor_quote_num;

 private:
  // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
  // ensure that the thread is associated with the correct device by calling
  // freshDeviceId().
  void freshDeviceId();

  // the max batch size
  int max_batch_;
  // the runtime batch size
...
...
@@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase {
  // the max memory size the engine uses
  int max_workspace_;

- cudaStream_t stream_;
- // The specific GPU id that the TensorRTEngine bounded to.
- int device_;

  bool enable_int8_;
  TRTInt8Calibrator* calibrator_;
  // batch size of the current data, will be updated each Executation.
  int batch_size_{-1};

  int device_id_;
  nvinfer1::ILogger& logger_;

- std::vector<Buffer> buffers_;
  // max data size for the buffers.
  std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
  std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
...
@@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase {
infer_ptr
<
nvinfer1
::
INetworkDefinition
>
infer_network_
;
infer_ptr
<
nvinfer1
::
ICudaEngine
>
infer_engine_
;
infer_ptr
<
nvinfer1
::
IExecutionContext
>
infer_context_
;
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
// freshDeviceId().
void
freshDeviceId
();
infer_ptr
<
nvinfer1
::
IHostMemory
>
ihost_memory_
;
};
// class TensorRTEngine
// Add an layer__ into engine__ with args ARGS.
// For example:
// TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
//
// Reference
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
...
...
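Serialize and Deserialize are what back the static-engine option: a built ICudaEngine can be dumped once and reloaded on later runs instead of being rebuilt. A hedged round-trip sketch (file I/O is omitted; constructor arguments are illustrative):

  nvinfer1::IHostMemory *blob = engine->Serialize();  // engine must be frozen first
  std::string serialized(static_cast<const char *>(blob->data()), blob->size());
  // ... persist `serialized` somewhere ...

  TensorRTEngine reloaded(max_batch, workspace_size, /*enable_int8=*/false,
                          /*calibrator=*/nullptr, /*device_id=*/0);
  reloaded.Deserialize(serialized);  // plugin layers resolved via PluginFactoryTensorRT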
paddle/fluid/inference/tensorrt/helper.h
...
...
@@ -17,6 +17,9 @@
#include <NvInfer.h>
#include <cuda.h>
#include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/dynload/tensorrt.h"
#include "paddle/fluid/platform/enforce.h"
...
...
@@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger {
  ~NaiveLogger() override {}
};

class NaiveProfiler : public nvinfer1::IProfiler {
 public:
  typedef std::pair<std::string, float> Record;
  std::vector<Record> mProfile;

  virtual void reportLayerTime(const char* layerName, float ms) {
    auto record =
        std::find_if(mProfile.begin(), mProfile.end(),
                     [&](const Record& r) { return r.first == layerName; });
    if (record == mProfile.end())
      mProfile.push_back(std::make_pair(layerName, ms));
    else
      record->second += ms;
  }

  void printLayerTimes() {
    float totalTime = 0;
    for (size_t i = 0; i < mProfile.size(); i++) {
      printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
             mProfile[i].second);
      totalTime += mProfile[i].second;
    }
    printf("Time over all layers: %4.3f\n", totalTime);
  }
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
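NaiveProfiler follows the stock nvinfer1::IProfiler contract, so it can be attached to an execution context to collect per-layer times. A hedged sketch, assuming synchronous execute() so that TensorRT reports layer timings:

  NaiveProfiler profiler;
  nvinfer1::IExecutionContext *ctx = engine->engine()->createExecutionContext();
  ctx->setProfiler(&profiler);
  ctx->execute(batch_size, buffers.data());  // synchronous run
  profiler.printLayerTimes();
  ctx->destroy();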
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
nv_library(tensorrt_plugin
-   SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
    SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
    prelu_op_plugin.cu trt_plugin_factory.cc
    avg_pool_op_plugin.cu
    DEPS enforce tensorrt_engine prelu)
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/pooling.h"
namespace paddle {
...
...
@@ -20,6 +21,12 @@ namespace inference {
namespace tensorrt {
namespace plugin {

AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer,
                                              size_t length) {
  return new AvgPoolPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize);

nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
    int index, const nvinfer1::Dims* inputDims, int nbInputs) {
  assert(nbInputs == 1);
...
...
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
...
...
@@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT {
 protected:
  size_t getSerializationSize() override {
-   return SerializedSize(ceil_mode_) + SerializedSize(ksize_) +
-          SerializedSize(strides_) + SerializedSize(paddings_) +
-          SerializedSize(input_shape_) + getBaseSerializationSize();
    return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) +
           SerializedSize(ksize_) + SerializedSize(strides_) +
           SerializedSize(paddings_) + SerializedSize(input_shape_) +
           SerializedSize(output_shape_) + getBaseSerializationSize();
  }

  // TRT will call this func when we need to serialize the configuration of
  // tensorrt.
  // It should not be called by users.
  void serialize(void* buffer) override {
    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, ceil_mode_);
    SerializeValue(&buffer, ksize_);
    SerializeValue(&buffer, strides_);
    SerializeValue(&buffer, paddings_);
    SerializeValue(&buffer, input_shape_);
    SerializeValue(&buffer, output_shape_);
  }

 public:
  AvgPoolPlugin() {}
  AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
                std::vector<int> strides, std::vector<int> paddings,
                std::vector<int> input_shape)
...
...
@@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT {
    DeserializeValue(&serialData, &serialLength, &strides_);
    DeserializeValue(&serialData, &serialLength, &paddings_);
    DeserializeValue(&serialData, &serialLength, &input_shape_);
    DeserializeValue(&serialData, &serialLength, &output_shape_);
  }

  AvgPoolPlugin* clone() const override {
...
...
@@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT {
                             input_shape_);
  }

- const char* getPluginType() const override { return "avg_pool"; }
  const char* getPluginType() const override { return "avg_pool_plugin"; }
  int getNbOutputs() const override { return 1; }
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                     int nbInputDims) override;
...
...
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
...
...
@@ -14,12 +14,19 @@ limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer,
                                                      size_t length) {
  return new ElementWisePlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize);

namespace details {

template <typename T>
...
...
@@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
  const float* y = reinterpret_cast<const float*>(inputs[1]);
  float* out = reinterpret_cast<float*>(outputs[0]);

- if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
  if (type_ == "add") {
    details::ElementWise(details::Add<float>(), x, y, out, batch_size,
                         prev_size_, midd_size_, post_size_, stream);
- } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
  } else if (type_ == "mul") {
    details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
                         prev_size_, midd_size_, post_size_, stream);
  } else {
...
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
...
@@ -24,9 +25,8 @@ namespace plugin {
class ElementWisePlugin : public PluginTensorRT {
 public:
- ElementWisePlugin(nvinfer1::ElementWiseOperation type,
-                   nvinfer1::Dims const& dims_x, nvinfer1::Dims const& dims_y,
-                   int axis)
  ElementWisePlugin(std::string type, nvinfer1::Dims const& dims_x,
                    nvinfer1::Dims const& dims_y, int axis)
      : type_(type),
        dims_x_(dims_x),
        dims_y_(dims_y),
...
...
@@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT {
  ElementWisePlugin(void const* serial_data, size_t serial_length) {
    deserializeBase(serial_data, serial_length);
    const char* elementwise_type;
    DeserializeValue(&serial_data, &serial_length, &elementwise_type);
    type_ = std::string(elementwise_type);
    DeserializeValue(&serial_data, &serial_length, &axis_);
    DeserializeValue(&serial_data, &serial_length, &dims_x_);
    DeserializeValue(&serial_data, &serial_length, &dims_y_);
...
...
@@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT {
    return nullptr;
  }

- const char* getPluginType() const override { return "elementwise"; }
  const char* getPluginType() const override { return "elementwise_plugin"; }

  nvinfer1::Dims getOutputDimensions(int index,
                                     const nvinfer1::Dims* input_dims,
...
@@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT {
protected:
size_t
getSerializationSize
()
override
{
return
SerializedSize
(
axis_
)
+
SerializedSize
(
dims_x_
)
+
SerializedSize
(
dims_y_
)
+
getBaseSerializationSize
();
return
SerializedSize
(
getPluginType
())
+
SerializedSize
(
axis_
)
+
SerializedSize
(
dims_x_
)
+
SerializedSize
(
dims_y_
)
+
getBaseSerializationSize
();
}
void
serialize
(
void
*
buffer
)
override
{
SerializeValue
(
&
buffer
,
getPluginType
());
serializeBase
(
buffer
);
SerializeValue
(
&
buffer
,
type_
.
c_str
());
SerializeValue
(
&
buffer
,
axis_
);
SerializeValue
(
&
buffer
,
dims_x_
);
SerializeValue
(
&
buffer
,
dims_y_
);
}
nvinfer1
::
ElementWiseOperation
type_
;
std
::
string
type_
;
nvinfer1
::
Dims
dims_x_
;
nvinfer1
::
Dims
dims_y_
;
int
axis_
;
...
...
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
...
...
@@ -17,6 +17,7 @@
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle {
...
...
@@ -24,6 +25,17 @@ namespace inference {
namespace tensorrt {
namespace plugin {

PReluPlugin* CreatePreluPluginDeserialize(const void* buffer, size_t length) {
  return new PReluPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize);

int PReluPlugin::initialize() {
  cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
  cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
             cudaMemcpyHostToDevice);
}

nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
                                                const nvinfer1::Dims* inputDims,
                                                int nbInputs) {
...
...
@@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
  // input dims is CHW.
  const auto& input_dims = this->getInputDims(0);
  const float* input = reinterpret_cast<const float*>(inputs[0]);
- const float* alpha = reinterpret_cast<const float*>(alpha_.get().values);
  // const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
  const float* alpha = p_gpu_weight_;
  float* output = reinterpret_cast<float**>(outputs)[0];

  std::vector<int> input_shape;
...
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
...
...
@@ -14,7 +14,12 @@
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
...
@@ -24,39 +29,51 @@ namespace tensorrt {
namespace plugin {

class PReluPlugin : public PluginTensorRT {
- TensorRTEngine::Weight alpha_;
  std::vector<float> weight_;
  float* p_gpu_weight_;
  std::string mode_;

 protected:
  size_t getSerializationSize() override {
-   // return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
-   return 0;
    return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
           SerializedSize(weight_) + SerializedSize(getPluginType());
  }

  // TRT will call this func when we need to serialize the configuration of
  // tensorrt.
  // It should not be called by users.
  void serialize(void* buffer) override {
-   // serializeBase(buffer);
-   // SerializeValue(&buffer, alpha_);
-   // SerializeValue(&buffer, mode_);
    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, weight_);
    SerializeValue(&buffer, mode_.c_str());
  }

 public:
- PReluPlugin(TensorRTEngine::Weight const& alpha, std::string const& mode)
-     : alpha_(alpha), mode_(mode) {}
  PReluPlugin(const float* weight, const int weight_num,
              std::string const& mode)
      : mode_(mode) {
    weight_.resize(weight_num);
    std::copy(weight, weight + weight_num, weight_.data());
  }

  // It was used for tensorrt deserialization.
  // It should not be called by users.
  PReluPlugin(void const* serialData, size_t serialLength) {
-   // deserializeBase(serialData, serialLength);
-   // DeserializeValue(&serialData, &serialLength, &alpha_);
-   // DeserializeValue(&serialData, &serialLength, &mode_);
    deserializeBase(serialData, serialLength);
    DeserializeValue(&serialData, &serialLength, &weight_);
    const char* prelu_mode;
    DeserializeValue(&serialData, &serialLength, &prelu_mode);
    mode_ = std::string(prelu_mode);
  }

  ~PReluPlugin() { cudaFree(p_gpu_weight_); }
  int initialize() override;

- PReluPlugin* clone() const override { return new PReluPlugin(alpha_, mode_); }
  PReluPlugin* clone() const override {
    return new PReluPlugin(weight_.data(), weight_.size(), mode_);
  }

- const char* getPluginType() const override { return "prelu"; }
  const char* getPluginType() const override { return "prelu_plugin"; }
  int getNbOutputs() const override { return 1; }
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                     int nbInputDims) override;
...
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
...
...
@@ -15,12 +15,18 @@
#include <cuda_fp16.h>
#include <algorithm>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
  return new SplitPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);

// copied from operators::math::SplitFunctor
template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row,
...
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
...
...
@@ -15,6 +15,7 @@
#pragma once
#include <thrust/device_vector.h>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
...
@@ -25,6 +26,7 @@ namespace plugin {
class SplitPlugin : public PluginTensorRT {
 public:
  SplitPlugin() {}
  SplitPlugin(int axis, std::vector<int> const& output_lengths)
      : axis_(axis), same_shape_(true), output_length_(output_lengths) {}
...
...
@@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT {
    return new SplitPlugin(axis_, output_length_);
  }

- const char* getPluginType() const override { return "split"; }
  const char* getPluginType() const override { return "split_plugin"; }
  int getNbOutputs() const override { return output_length_.size(); }
  nvinfer1::Dims getOutputDimensions(int index,
                                     const nvinfer1::Dims* input_dims,
...
...
@@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT {
 protected:
  size_t getSerializationSize() override {
-   return SerializedSize(axis_) + SerializedSize(output_length_) +
-          getBaseSerializationSize();
    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
           SerializedSize(output_length_) + getBaseSerializationSize();
  }

  void serialize(void* buffer) override {
    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, axis_);
    SerializeValue(&buffer, output_length_);
);
...
...
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
...
...
@@ -17,9 +17,10 @@
#include <NvInfer.h>
#include <cstring>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/
serialize
.h"
#include "paddle/fluid/inference/tensorrt/plugin/
trt_plugin_utils
.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -30,6 +31,13 @@ namespace inference {
namespace tensorrt {
namespace plugin {

class PluginTensorRT;

typedef std::function<PluginTensorRT*(const void*, size_t)>
    PluginDeserializeFunc;
typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;

class PluginTensorRT : public nvinfer1::IPluginExt {
 public:
  PluginTensorRT() {}
...
...
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
                                                    const void* serial_data,
                                                    size_t serial_length) {
  const char* plugin_type;
  DeserializeValue(&serial_data, &serial_length, &plugin_type);

  PADDLE_ENFORCE(Has(plugin_type),
                 "trt plugin type %s does not exists, check it.", plugin_type);
  auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
  owned_plugins_.emplace_back(plugin);

  return plugin;
}

bool PluginFactoryTensorRT::RegisterPlugin(
    const std::string& op_name, PluginDeserializeFunc deserialize_func) {
  if (Has(op_name)) return false;
  auto ret = plugin_registry_.emplace(op_name, deserialize_func);
  return ret.second;
}

void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <NvInfer.h>
#include <cstring>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

class PluginFactoryTensorRT : public nvinfer1::IPluginFactory,
                              public DeleteHelper {
 public:
  // Deserialization method
  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
                               size_t serial_length) override;

  bool RegisterPlugin(const std::string& op_name,
                      PluginDeserializeFunc deserialize_func);

  bool Has(const std::string& op_name) {
    return plugin_registry_.find(op_name) != plugin_registry_.end();
  }

  void DestroyPlugins();

 protected:
  std::unordered_map<std::string, PluginDeserializeFunc> plugin_registry_;
  std::list<std::unique_ptr<PluginTensorRT>> owned_plugins_;
};

class TrtPluginRegistrar {
 public:
  TrtPluginRegistrar(const std::string& name,
                     PluginDeserializeFunc deserialize_func) {
    inference::Singleton<PluginFactoryTensorRT>::Global().RegisterPlugin(
        name, deserialize_func);
  }
};
#define REGISTER_TRT_PLUGIN(name, deserialize_func) \
REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func)
#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \
static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \
trt_plugin_registrar##ctr __attribute__((unused)) = \
paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
name, deserialize_func)
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
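A hypothetical usage sketch (not part of this commit): a concrete plugin would expose a PluginDeserializeFunc and register it through the REGISTER_TRT_PLUGIN macro defined above, so that PluginFactoryTensorRT::createPlugin can rebuild it from a serialized engine. The plugin name "my_plugin" and the function DeserializeMyPlugin below are assumptions for illustration only.

// Sketch: register a deserializer for a plugin type named "my_plugin".
// DeserializeMyPlugin is assumed to be defined elsewhere and to return a
// PluginTensorRT subclass reconstructed from (serial_data, serial_length).
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

PluginTensorRT* DeserializeMyPlugin(const void* serial_data,
                                    size_t serial_length);

REGISTER_TRT_PLUGIN("my_plugin", DeserializeMyPlugin);

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle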
paddle/fluid/inference/tensorrt/plugin/serialize.h → paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h

@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #pragma once
 #include <cstring>
 #include <string>
 #include <type_traits>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
...
@@ -24,6 +24,13 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+// Some trt base classes lack of the destructor.
+// We use a assisted class to fix this.
+struct DeleteHelper {
+ protected:
+  virtual ~DeleteHelper() {}
+};
+
 template <typename T>
 inline void SerializeValue(void** buffer, T const& value);
...
paddle/fluid/inference/tensorrt/test_engine.cc

@@ -17,6 +17,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/enforce.h"
...
@@ -27,19 +29,34 @@ namespace tensorrt {
 class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    ASSERT_EQ(0, cudaStreamCreate(&stream_));
-    engine_ = new TensorRTEngine(10, 1 << 10, stream_);
+    ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
+
+    engine_ = new TensorRTEngine(10, 1 << 10);
     engine_->InitNetwork();
   }
 
   void TearDown() override {
+    if (engine_) {
       delete engine_;
-      cudaStreamDestroy(stream_);
+      engine_ = nullptr;
+    }
+  }
+
+  void PrepareInputOutput(const std::vector<float> &input,
+                          std::vector<int> output_shape) {
+    TensorFromVector(input, *ctx_, &input_);
+    output_.Resize(framework::make_ddim(output_shape));
+  }
+
+  void GetOutput(std::vector<float> *output) {
+    TensorToVector(output_, *ctx_, output);
   }
 
  protected:
-  TensorRTEngine *engine_;
-  cudaStream_t stream_;
+  framework::Tensor input_;
+  framework::Tensor output_;
+  TensorRTEngine *engine_;
+  platform::CUDADeviceContext *ctx_;
 };
 
 TEST_F(TensorRTEngineTest, add_layer) {
...
@@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) {
   float raw_weight[size] = {2.};  // Weight in CPU memory.
   float raw_bias[size] = {3.};
 
+  std::vector<void *> buffers(2);  // TRT binded inputs
+
   LOG(INFO) << "create weights";
   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size);
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::DimsCHW{1, 1, 1});
   auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
                                         weight.get(), bias.get());
   PADDLE_ENFORCE(fc_layer != nullptr);
...
@@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) {
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
   // fill in real data
-  float x_v = 1234;
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           1 * sizeof(float));
+  std::vector<float> x_v = {1234};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {1});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
   LOG(INFO) << "to execute";
-  engine_->Execute(1);
+  engine_->Execute(1, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float y_cpu;
-  engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
+  GetOutput(&y_cpu);
 
   LOG(INFO) << "to checkout output";
-  ASSERT_EQ(y_cpu, x_v * 2 + 3);
+  ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
 }
 
 TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
...
@@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
   float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
   float raw_bias[2] = {1.3, 2.4};
+  std::vector<void *> buffers(2);  // TRT binded inputs
 
   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::DimsCHW{1, 2, 1});
   auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
                                         weight.get(), bias.get());
   PADDLE_ENFORCE(fc_layer != nullptr);
...
@@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
-  float x_v[2] = {1.0, 2.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           2 * sizeof(float));
-  engine_->Execute(1);
+  // fill in real data
+  std::vector<float> x_v = {1.0, 2.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {2});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(1, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float y_cpu[2] = {-1., -1.};
+  GetOutput(&y_cpu);
 
   auto dims = engine_->GetITensor("y")->getDimensions();
   ASSERT_EQ(dims.nbDims, 3);
   ASSERT_EQ(dims.d[0], 2);
   ASSERT_EQ(dims.d[1], 1);
-  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
 
   ASSERT_EQ(y_cpu[0], 4.5);
   ASSERT_EQ(y_cpu[1], 14.5);
 }
...
@@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
   // Weight in CPU memory.
   float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
   float raw_bias[1] = {0};
+  std::vector<void *> buffers(2);  // TRT binded inputs
 
   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::Dims3{1, 3, 3});
   auto *conv_layer =
       TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
                            weight.get(), bias.get());
   PADDLE_ENFORCE(conv_layer != nullptr);
...
@@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
-  float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-                   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           18 * sizeof(float));
-  engine_->Execute(2);
+  // fill in real data
+  std::vector<float> x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {18});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(2, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float *y_cpu = new float[18];
-  engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
+  GetOutput(&y_cpu);
 
   ASSERT_EQ(y_cpu[0], 4.0);
   ASSERT_EQ(y_cpu[1], 6.0);
 }
 
 TEST_F(TensorRTEngineTest, test_pool2d) {
   // Weight in CPU memory.
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::Dims3{1, 2, 2});
+  std::vector<void *> buffers(2);  // TRT binded inputs
 
   nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
-  auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
-                                          *const_cast<nvinfer1::ITensor *>(x),
-                                          pool_t, nvinfer1::DimsHW{2, 2});
+  auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
+                                          nvinfer1::DimsHW{2, 2});
 
   PADDLE_ENFORCE(pool_layer != nullptr);
   pool_layer->setStride(nvinfer1::DimsHW{1, 1});
...
@@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
-  float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           8 * sizeof(float));
-  engine_->Execute(2);
+  // fill in real data
+  std::vector<float> x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {2});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(2, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float *y_cpu = new float[2];
-  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
+  GetOutput(&y_cpu);
 
   ASSERT_EQ(y_cpu[0], 2.0);
   ASSERT_EQ(y_cpu[1], 5.0);
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc

@@ -366,15 +366,17 @@ TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
 #define NEW_TENSOR(name__) \
   auto name__##_tensor = predictor->GetInputTensor(#name__);
 
-  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
+
   double total_time_of_threads{0};
   std::vector<std::thread> threads;
 
   for (int tid = 0; tid < FLAGS_num_threads; tid++) {
     threads.emplace_back([&, tid] {
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
       NEW_TENSOR(data_lod_attention);
       NEW_TENSOR(cell_init);
       NEW_TENSOR(data);
...
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc

@@ -266,15 +266,17 @@ TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
   SetConfig(&config);
   config.SwitchUseFeedFetchOps(false);
 
-  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
+
   double total_time_of_threads{0};
   std::vector<std::thread> threads;
 
   for (int tid = 0; tid < FLAGS_num_threads; tid++) {
     threads.emplace_back([&, tid] {
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
       std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
       PrepareZeroCopyInputs(predictor, &inputs);
       auto output_tensor = predictor->GetOutputTensor(out_var_name);
...
paddle/fluid/inference/tests/api/tester_helper.h

@@ -17,8 +17,10 @@
 #include <gtest/gtest.h>
 
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <thread>  // NOLINT
+#include <unordered_map>
 #include <vector>
 #ifdef WITH_GPERFTOOLS
 #include <gperftools/profiler.h>
...
@@ -252,7 +254,11 @@ void TestMultiThreadPrediction(
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
-  auto main_predictor = CreateTestPredictor(config, use_analysis);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
+  for (int tid = 1; tid < num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
 
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
...
@@ -260,9 +266,7 @@ void TestMultiThreadPrediction(
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector<PaddleTensor> outputs_tid;
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = main_predictor->Clone();
+      auto &predictor = predictors[tid];
 #ifdef PADDLE_WITH_MKLDNN
       if (use_analysis) {
         static_cast<AnalysisPredictor *>(predictor.get())
...
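The three test changes above share one pattern: predictors are created and cloned once on the main thread, and each worker thread then picks its own instance by thread id instead of calling Clone() inside the thread body. A condensed sketch of that pattern, using only names that appear in the diffs above:

// Clone all predictors up front, then hand each thread its own instance.
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
for (int tid = 1; tid < num_threads; ++tid) {
  predictors.emplace_back(predictors.front()->Clone());
}

std::vector<std::thread> threads;
for (int tid = 0; tid < num_threads; ++tid) {
  threads.emplace_back([&, tid] {
    auto &predictor = predictors[tid];  // thread-local predictor, no Clone() here
    // ... feed inputs and run this predictor as the tests above do ...
  });
}
for (auto &t : threads) t.join();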
paddle/fluid/inference/tests/api/trt_models_tester.cc

@@ -54,7 +54,8 @@ void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir,
   if (use_gpu) {
     config->EnableUseGpu(100, 0);
     if (use_tensorrt) {
-      config->EnableTensorRtEngine(1 << 10, batch_size);
+      config->EnableTensorRtEngine(1 << 10, batch_size, 3,
+                                   AnalysisConfig::Precision::kFloat32, false);
       config->pass_builder()->DeletePass("conv_bn_fuse_pass");
       config->pass_builder()->DeletePass("fc_fuse_pass");
       config->pass_builder()->TurnOnDebug();
...
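For reference, a hedged reading of the extended EnableTensorRtEngine call used above; the comments are descriptive interpretations of each argument and are not taken verbatim from this diff.

// config is an AnalysisConfig*, as in SetConfig above.
config->EnableTensorRtEngine(1 << 10,                              // workspace size
                             batch_size,                           // max batch size
                             3,                                    // min subgraph size to offload to TRT
                             AnalysisConfig::Precision::kFloat32,  // inference precision
                             false);                               // whether to reuse a static (serialized) engine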
paddle/fluid/memory/allocation/allocator.cc

@@ -26,20 +26,17 @@ Allocator::~Allocator() {}
 bool Allocator::IsAllocThreadSafe() const { return false; }
 
 AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
-  VLOG(2) << "Alloc allocation on " << typeid(*this).name();
   auto ptr = AllocateImpl(size, attr);
   ptr->RegisterAllocatorChain(this);
-  VLOG(2) << "Alloc success";
   return AllocationPtr(ptr);
 }
 
 void Allocator::FreeImpl(Allocation* allocation) {
-  auto* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopAllocator();
   allocator->Free(allocation);
 }
 
 void Allocator::Free(Allocation* allocation) {
-  VLOG(2) << "Free allocation on " << typeid(*this).name();
   allocation->PopAllocator();
   FreeImpl(allocation);
 }
...
@@ -47,7 +44,7 @@ void Allocator::Free(Allocation* allocation) {
 const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
 
 void AllocationDeleter::operator()(Allocation* allocation) const {
-  auto* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopAllocator();
   allocator->Free(allocation);
 }
...
paddle/fluid/memory/allocation/allocator.h

@@ -16,7 +16,7 @@
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/small_stack.h"
+#include "paddle/fluid/framework/inlined_stack.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
...
@@ -82,7 +82,7 @@ class Allocation {
   std::vector<Allocator*> GetAllocatorChain() const {
     std::vector<Allocator*> allocators;
     for (size_t i = 0; i < allocator_chain_.size(); ++i) {
-      allocators[i] = allocator_chain_[i];
+      allocators.push_back(allocator_chain_[i]);
     }
     return allocators;
   }
...
@@ -100,7 +100,7 @@ class Allocation {
   void* ptr_;
   size_t size_;
   platform::Place place_;
-  framework::SmallStack<Allocator*, 8> allocator_chain_;
+  framework::InlinedStack<Allocator*, 8> allocator_chain_;
 
   friend class Allocator;
   friend class AllocationDeleter;
...
paddle/fluid/memory/allocation/legacy_allocator.cc

@@ -36,6 +36,8 @@ DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_double(initial_gpu_memory_in_mb);
+DECLARE_double(reallocate_gpu_memory_in_mb);
 DECLARE_bool(benchmark);
 
 namespace paddle {
...
@@ -69,7 +71,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
   std::call_once(init_flag, []() {
     a = new detail::BuddyAllocator(
         std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
+        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(),
+        platform::CpuMaxChunkSize());
   });
 
   return a;
...
@@ -131,40 +134,53 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
 }
 
 #ifdef PADDLE_WITH_CUDA
-BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
-  static std::once_flag init_flag;
-  static detail::BuddyAllocator **a_arr = nullptr;
-  static std::vector<int> devices;
-
-  std::call_once(init_flag, [gpu_id]() {
-    devices = platform::GetSelectedDevices();
-    int gpu_num = devices.size();
-    allocation::GPUMemMonitor.Initialize(devices.size());
-
-    a_arr = new BuddyAllocator *[gpu_num];
-    for (size_t i = 0; i < devices.size(); ++i) {
-      int dev_id = devices[i];
-      a_arr[i] = nullptr;
-      platform::SetDeviceId(dev_id);
-      a_arr[i] = new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(
-              new detail::GPUAllocator(dev_id)),
-          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
-
-      VLOG(10) << "\n\nNOTE: each GPU device use "
-               << FLAGS_fraction_of_gpu_memory_to_use * 100
-               << "% of GPU memory.\n"
-               << "You can set GFlags environment variable '"
-               << "FLAGS_fraction_of_gpu_memory_to_use"
-               << "' to change the fraction of GPU usage.\n\n";
-    }
-  });
+class GPUBuddyAllocatorList {
+ public:
+  GPUBuddyAllocatorList()
+      : allocators_(platform::GetCUDADeviceCount()),
+        flags_(platform::GetCUDADeviceCount()) {
+    allocation::GPUMemMonitor.Initialize(allocators_.size());
+  }
+
+  BuddyAllocator *Get(size_t dev_id) {
+    PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
+    std::call_once(flags_[dev_id], [this, dev_id] {
+      platform::SetDeviceId(dev_id);
+      size_t first_size = platform::GpuFirstAllocateChunkSize();
+      size_t re_size = platform::GpuReAllocateChunkSize();
+      allocators_[dev_id] = new BuddyAllocator(
+          std::unique_ptr<detail::SystemAllocator>(
+              new detail::GPUAllocator(dev_id)),
+          platform::GpuMinChunkSize(), first_size, re_size);
+      VLOG(2) << "\n\nNOTE: each GPU device use "
+              << string::HumanReadableSize(first_size) << "(initial chunk) "
+              << string::HumanReadableSize(re_size) << "(reallocate chunk) "
+              << "% of GPU memory.\n"
+              << "You can set GFlags environment variable '"
+              << "FLAGS_fraction_of_gpu_memory_to_use"
+              << "' or "
+                 "'FLAGS_initial_gpu_memory_in_mb/"
+                 "FLAGS_reallocate_gpu_memory_in_mb' to change the fraction "
+                 "of GPU usage.\n\n";
+      VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use="
+              << FLAGS_fraction_of_gpu_memory_to_use << ", "
+              << "FLAGS_initial_gpu_memory_in_mb="
+              << FLAGS_initial_gpu_memory_in_mb << ", "
+              << "FLAGS_reallocate_gpu_memory_in_mb="
+              << FLAGS_reallocate_gpu_memory_in_mb;
+    });
+    return allocators_[dev_id];
+  }
+
+ private:
+  std::vector<BuddyAllocator *> allocators_;
+  std::vector<std::once_flag> flags_;
+};
 
+BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
+  static GPUBuddyAllocatorList allocators;
   platform::SetDeviceId(gpu_id);
-  auto pos = std::distance(devices.begin(),
-                           std::find(devices.begin(), devices.end(), gpu_id));
-  return a_arr[pos];
+  return allocators.Get(gpu_id);
 }
 #endif
...
@@ -183,7 +199,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 #ifdef PADDLE_WITH_CUDA
   auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto *ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr) {
+  if (ptr == nullptr && size > 0) {
     int cur_dev = platform::GetCurrentDeviceId();
     platform::SetDeviceId(place.device);
     size_t avail, total;
...
@@ -234,6 +250,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::CUDAPinnedAllocator),
                             platform::CUDAPinnedMinChunkSize(),
+                            platform::CUDAPinnedMaxChunkSize(),
                             platform::CUDAPinnedMaxChunkSize());
   });
...
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc

@@ -14,16 +14,90 @@
 #include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
 #include <algorithm>
+#include <cctype>
+#include <fstream>
+#include <limits>
+#include <sstream>
+#include <string>
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 
 DEFINE_double(tolerant_times, 2,
               "Tolerant memory size times of buffered_allocator");
+DEFINE_string(division_plan_path, "", "Division plan file path");
 
 namespace paddle {
 namespace memory {
 namespace allocation {
 
+std::string TrimStringAndToLowerCase(const std::string &str) {
+  auto not_space = [](char ch) { return std::isspace(ch) == 0; };
+  auto first_idx = static_cast<size_t>(
+      std::find_if(str.begin(), str.end(), not_space) - str.begin());
+  auto last_idx = static_cast<size_t>(
+      std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin());
+  if (first_idx == str.size() || last_idx == str.size()) return "";
+
+  last_idx = str.size() - 1 - last_idx;
+  auto ret = str.substr(first_idx, last_idx - first_idx);
+  std::for_each(ret.begin(), ret.end(),
+                [](char &ch) { ch = std::tolower(ch); });
+  return ret;
+}
+
+static size_t ParseStringToBytes(const std::string &str) {
+  std::string ret = str;
+  if (ret.back() == 'b') {
+    ret.pop_back();
+  }
+
+  PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str);
+  size_t multiples = 1;
+  switch (ret.back()) {
+    case 'g':
+      multiples *= (static_cast<size_t>(1) << 30);
+      break;
+    case 'm':
+      multiples *= (static_cast<size_t>(1) << 20);
+      break;
+    case 'k':
+      multiples *= (static_cast<size_t>(1) << 10);
+      break;
+    default:
+      break;
+  }
+
+  if (multiples != 1) ret.pop_back();
+  ret = TrimStringAndToLowerCase(ret);
+  double ret_val = 0.0;
+  std::stringstream ss(ret);
+  PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str);
+  return static_cast<size_t>(ret_val * multiples);
+}
+
+static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
+  std::string ret("[");
+  for (auto sz : plan) {
+    ret += string::HumanReadableSize(sz);
+    ret += ", ";
+  }
+  return ret + "]";
+}
+
+static std::vector<size_t> ReadDivisionPlanFromFile(
+    const std::string &filepath) {
+  std::ifstream is(filepath.c_str());
+  PADDLE_ENFORCE(is.good(), "File not exist");
+  std::string str;
+  std::vector<size_t> plan;
+  while (std::getline(is, str).good()) {
+    str = TrimStringAndToLowerCase(str);
+    if (str.empty()) break;
+    plan.push_back(ParseStringToBytes(str));
+  }
+  return plan;
+}
+
 static void CheckAndModifyMemoryDivisionPlan(
     std::vector<size_t> *division_plan) {
   // Check whether the division plan is strictly sorted
...
@@ -50,10 +124,21 @@ static void CheckAndModifyMemoryDivisionPlan(
 }
 
 static std::vector<size_t> GetDefaultDivisionPlan() {
+  if (!FLAGS_division_plan_path.empty()) {
+    return ReadDivisionPlanFromFile(FLAGS_division_plan_path);
+  }
+
+  constexpr size_t kMaxLogSize = 30;
+
   std::vector<size_t> plan;
+  for (size_t i = 12; i <= kMaxLogSize; ++i) {
+    plan.push_back(static_cast<size_t>(1) << i);
+  }
+  /*
   for (size_t i = 0; i < sizeof(size_t) * 8; ++i) {
     plan.push_back(static_cast<size_t>(1) << i);
   }
+  */
   return plan;
 }
...
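A hedged reading of the size strings accepted by ParseStringToBytes above (a trailing 'b' is stripped first, then an optional k/m/g suffix selects a binary multiplier): "4kb" and "4k" both denote 4 * 2^10 bytes, "1.5m" denotes 1.5 * 2^20 bytes, and "2g" denotes 2 * 2^30 bytes. A division plan file consumed by ReadDivisionPlanFromFile and selected through --division_plan_path is then just one such size per line, for example a hypothetical file containing:

4k
64k
1m
16m
256m
1g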
@@ -78,27 +163,32 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator(
     : underlying_allocator_(std::move(underlying_allocator)),
       division_plan_(division_plan) {
   CheckAndModifyMemoryDivisionPlan(&division_plan_);
-  allocations_.resize(division_plan_.size());
-  mtx_.resize(division_plan_.size());
+  allocations_.resize(division_plan_.size() - 1);
+  mtx_.resize(division_plan_.size() - 1);
   if (underlying_allocator_->IsAllocThreadSafe()) {
     for (auto &mtx : mtx_) {
       mtx.reset(new std::mutex());
     }
   }
 
   VLOG(1) << "Division plan is: " << GetDebugStringOfPlan(division_plan_);
   VLOG(1) << "FLAGS_tolerant_times = " << FLAGS_tolerant_times;
 }
 
 void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
   auto bin_index = FindDivisionPlanBinIndex(division_plan_, allocation->size());
-  {
+  if (bin_index < allocations_.size()) {
     platform::LockGuardPtr<std::mutex> guard(mtx_[bin_index]);
     allocations_[bin_index].emplace(allocation->size(),
                                     AllocationPtr(allocation));
+  } else {
+    underlying_allocator_->Free(allocation);
   }
 }
 
-void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
+// bin_index is not used currently.
+// Maybe we can design more flexible FreeCache strategy based on bin_index
+size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
+  size_t accumulated_size = 0;
   // FIXME(zjl): free the largest first when there is no extra
   for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
...
@@ -110,33 +200,53 @@ void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
       underlying_allocator_->Free(it->second.release());
       allocations_[i].erase(it--);
       if (accumulated_size >= size) {
-        return;
+        return accumulated_size;
       }
     } while (!allocations_[i].empty());
   }
+  return accumulated_size;
 }
 
 Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
   auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
   auto upper_size = TolerantUpperSize(size);
 
-  for (; upper_size >= division_plan_[bin_index]; ++bin_index) {
+  // if (bin_index >= allocations_.size()) {
+  //   VLOG(2) << "Allocate " << size << " from underlying directly";
+  // }
+  for (; bin_index < allocations_.size() &&
+         upper_size >= division_plan_[bin_index];
+       ++bin_index) {
     auto &allocation = allocations_[bin_index];
     platform::LockGuardPtr<std::mutex> lock(mtx_[bin_index]);
     auto it = allocation.lower_bound(size);
-    if (it != allocation.end() && it->second->size() < upper_size) {
+    if (it != allocation.end() && it->second->size() <= upper_size) {
+      size_t sz = it->second->size();
       auto ret = std::move(it->second);
       allocation.erase(it);
+      VLOG(3) << "Allocate " << sz << "(required " << size
+              << ") from cache directly";
       return ret.release();
     }
   }
 
+  size_t retry_time = 1;
+  while (true) {
     try {
-      return underlying_allocator_->Allocate(size, attr).release();
+      auto ret = underlying_allocator_->Allocate(size, attr).release();
+      VLOG(2) << "Allocate " << size << " from underlying directly";
+      return ret;
     } catch (BadAlloc &) {
-      VLOG(2) << "BadAlloc raises, try to free " << size << " caches";
-      FreeCache(size, bin_index);
-      return underlying_allocator_->Allocate(size, attr).release();
+      VLOG(1) << retry_time << "-th BadAlloc raises, try to free " << size
+              << " bytes caches";
+      // size_t actual_free_size = FreeCache(size, bin_index);
+      size_t actual_free_size = FreeCache(-1UL, bin_index);
+      VLOG(1) << retry_time << "-th free " << actual_free_size
+              << " bytes caches";
+      if (actual_free_size == 0) throw;
     }
+    ++retry_time;
+  }
 }
...
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h

@@ -41,7 +41,7 @@ class MultiBinBufferedAllocator : public Allocator {
   void FreeImpl(Allocation *allocation) override;
 
  private:
-  void FreeCache(size_t size, size_t bin_index);
+  size_t FreeCache(size_t size, size_t bin_index);
 
   std::shared_ptr<Allocator> underlying_allocator_;
   std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
...
paddle/fluid/memory/detail/buddy_allocator.cc

@@ -25,9 +25,11 @@ namespace detail {
 BuddyAllocator::BuddyAllocator(
     std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
-    size_t max_chunk_size)
+    size_t first_allocate_chunk_size, size_t reallocate_chunk_size)
     : min_chunk_size_(min_chunk_size),
-      max_chunk_size_(max_chunk_size),
+      first_allocate_chunk_size_(first_allocate_chunk_size),
+      reallocate_chunk_size_(reallocate_chunk_size),
+      max_chunk_size_(first_allocate_chunk_size),
       cache_(system_allocator->UseGpu()),
       system_allocator_(std::move(system_allocator)) {}
...
@@ -36,9 +38,10 @@ BuddyAllocator::~BuddyAllocator() {
              "have actually been freed";
   while (!pool_.empty()) {
     auto block = static_cast<MemoryBlock *>(std::get<2>(*pool_.begin()));
-    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+    auto desc = cache_.load(block);
+    VLOG(10) << "Free from block (" << block << ", " << desc.size << ")";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, desc.size, desc.index);
     cache_.invalidate(block);
     pool_.erase(pool_.begin());
   }
...
@@ -63,7 +66,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // if the allocation is huge, send directly to the system allocator
   if (size > max_chunk_size_) {
     VLOG(10) << "Allocate from system allocator.";
-    return SystemAlloc(size);
+    return SystemAlloc(size, false);
   }
 
   // query and allocate from the existing chunk
...
@@ -72,9 +75,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // refill the pool if failure
   if (it == pool_.end()) {
     it = RefillPool();
-    // if still failure, fail fatally
+    // if still failure, try to allocate from SystemAllocator
     if (it == pool_.end()) {
-      return nullptr;
+      return SystemAlloc(size, false);
     }
   } else {
     VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
...
@@ -98,7 +101,7 @@ void BuddyAllocator::Free(void* p) {
   VLOG(10) << "Free from address " << block;
 
-  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
+  if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) {
     VLOG(10) << "Free directly from system allocator";
     system_allocator_->Free(block, block->total_size(cache_),
                             block->index(cache_));
...
@@ -168,9 +171,12 @@ void BuddyAllocator::Free(void* p) {
 size_t BuddyAllocator::Used() { return total_used_; }
 size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
-size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
+size_t BuddyAllocator::GetMaxChunkSize() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return max_chunk_size_;
+}
 
-void* BuddyAllocator::SystemAlloc(size_t size) {
+void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(&index, size);
...
@@ -178,25 +184,23 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
   if (p == nullptr) return nullptr;
 
-  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
-                                     size, nullptr, nullptr);
+  static_cast<MemoryBlock*>(p)->init(
+      &cache_,
+      is_managed ? MemoryBlock::MANAGED_HUGE_CHUNK
+                 : MemoryBlock::UNMANAGED_HUGE_CHUNK,
+      index, size, nullptr, nullptr);
 
   return static_cast<MemoryBlock*>(p)->data();
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
 #ifdef PADDLE_WITH_CUDA
   if (system_allocator_->UseGpu()) {
     if ((total_used_ + total_free_) == 0) {
-      // Compute the maximum allocation size for the first allocation.
-      max_chunk_size_ = platform::GpuMaxChunkSize();
     }
+    if (total_used_ + total_free_ > 0) {
+      max_chunk_size_ = reallocate_chunk_size_;
+    }
 #endif
 
   // Allocate a new maximum sized block
   size_t index = 0;
-  void* p = system_allocator_->Alloc(&index, max_chunk_size_);
+  size_t chunk_size = max_chunk_size_;
+  void* p = system_allocator_->Alloc(&index, chunk_size);
 
   if (p == nullptr) return pool_.end();
...
@@ -204,7 +208,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
           << " from system allocator";
 
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
-                                     max_chunk_size_, nullptr, nullptr);
+                                     chunk_size, nullptr, nullptr);
 
   // gpu fallback allocation
   if (system_allocator_->UseGpu() &&
...
@@ -212,10 +216,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
     fallback_alloc_count_++;
   }
 
-  total_free_ += max_chunk_size_;
+  total_free_ += chunk_size;
 
   // dump the block into pool
-  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+  return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first;
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
...
@@ -271,27 +275,24 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
 void BuddyAllocator::CleanIdleFallBackAlloc() {
   // If fallback allocation does not exist, return directly
-  if (!fallback_alloc_count_) return;
+  if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return;
 
   for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
     MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
 
-    // If no GPU fallback allocator, return
-    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
+    auto desc = cache_.load(block);
+    if (desc.index == 0) {
       return;
     }
 
     VLOG(10) << "Return block " << block << " to fallback allocator.";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, desc.size, block->index(cache_));
     cache_.invalidate(block);
 
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
 
-    total_free_ -= max_chunk_size_;
+    total_free_ -= desc.size;
     fallback_alloc_count_--;
 
     // If no fall allocation exists, return directly
...
@@ -315,19 +316,21 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
   if (!shall_free_alloc()) return;
 
   for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
     MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
 
+    auto desc = cache_.load(block);
+    if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) {
+      return;
+    }
+
     VLOG(10) << "Return block " << block << " to base allocator.";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, desc.size, desc.index);
     cache_.invalidate(block);
 
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
 
-    total_free_ -= max_chunk_size_;
+    total_free_ -= desc.size;
 
     if (!shall_free_alloc()) return;
   }
...
paddle/fluid/memory/detail/buddy_allocator.h

@@ -34,7 +34,8 @@ namespace detail {
 class BuddyAllocator {
  public:
   BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
-                 size_t min_chunk_size, size_t max_chunk_size);
+                 size_t min_chunk_size, size_t first_allocate_chunk_size,
+                 size_t reallocate_chunk_size);
 
   ~BuddyAllocator();
...
@@ -57,7 +58,7 @@ class BuddyAllocator {
   using PoolSet = std::set<IndexSizeAddress>;
 
   /*! \brief Allocate fixed-size memory from system */
-  void* SystemAlloc(size_t size);
+  void* SystemAlloc(size_t size, bool is_managed = true);
 
   /*! \brief If existing chunks are not suitable, refill pool */
   PoolSet::iterator RefillPool();
...
@@ -87,7 +88,11 @@ class BuddyAllocator {
   size_t total_free_ = 0;   // the total size of free memory
 
   size_t min_chunk_size_;   // the minimum size of each chunk
-  size_t max_chunk_size_;   // the maximum size of each chunk
+
+  size_t first_allocate_chunk_size_;
+  size_t reallocate_chunk_size_;
+
+  size_t max_chunk_size_;
 
  private:
  /**
...
paddle/fluid/memory/detail/memory_block.h

@@ -29,7 +29,8 @@ struct MemoryBlock {
   enum Type {
     FREE_CHUNK,            // memory is free and idle
     ARENA_CHUNK,           // memory is being occupied
-    HUGE_CHUNK,            // memory is out of management
+    MANAGED_HUGE_CHUNK,    // memory is huge and managed by allocator
+    UNMANAGED_HUGE_CHUNK,  // memory is huge and out of management
     INVALID_CHUNK          // memory is invalid
   };
...
paddle/fluid/operators/benchmark/op_tester.cc

@@ -42,8 +42,8 @@ void OpTester::Init(const OpTesterConfig &config) {
   // Initialize the OpDesc
   if (op_desc_info.Has(config_.op_type)) {
     type_ = config_.op_type;
-    op_desc_.SetType(config_.op_type);
 
+    CreateOpDesc();
     CreateInputVarDesc();
     CreateOutputVarDesc();
   } else {
...
@@ -131,6 +131,40 @@ std::vector<std::string> OpTester::GetOpProtoOutputNames() {
   return output_names;
 }
 
+std::unordered_map<std::string, framework::proto::AttrType>
+OpTester::GetOpProtoAttrNames() {
+  std::unordered_map<std::string, framework::proto::AttrType> attr_types;
+  const framework::proto::OpProto &proto =
+      framework::OpInfoMap::Instance().Get(type_).Proto();
+  const std::vector<std::string> skipped_attrs = {
+      framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
+      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
+      framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(),
+      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()};
+  for (int i = 0; i != proto.attrs_size(); ++i) {
+    const auto &attr = proto.attrs(i);
+    if (!Has(skipped_attrs, attr.name())) {
+      VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type();
+      attr_types[attr.name()] = attr.type();
+    }
+  }
+  return attr_types;
+}
+
+framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
+  if (str == "int32") {
+    return framework::proto::VarType::INT32;
+  } else if (str == "int64") {
+    return framework::proto::VarType::INT64;
+  } else if (str == "fp32") {
+    return framework::proto::VarType::FP32;
+  } else if (str == "fp64") {
+    return framework::proto::VarType::FP64;
+  } else {
+    PADDLE_THROW("Unsupported dtype %s.", str.c_str());
+  }
+}
+
 void OpTester::CreateInputVarDesc() {
   std::vector<std::string> input_names = GetOpProtoInputNames();
   for (auto &name : input_names) {
...
@@ -145,11 +179,11 @@ void OpTester::CreateInputVarDesc() {
     // Need to support more type
     var->SetType(framework::proto::VarType::LOD_TENSOR);
     var->SetPersistable(false);
-    var->SetDataType(framework::proto::VarType::FP32);
+    var->SetDataType(TransToVarType(input->dtype));
     var->SetShape(input->dims);
 
     op_desc_.SetInput(name, {var_name});
-    input_lods_[var_name] = input->lod;
+    inputs_[var_name] = *input;
   }
 }
...
@@ -167,6 +201,49 @@ void OpTester::CreateOutputVarDesc() {
   }
 }
 
+void OpTester::CreateOpDesc() {
+  op_desc_.SetType(config_.op_type);
+  std::unordered_map<std::string, framework::proto::AttrType> attr_types =
+      GetOpProtoAttrNames();
+  for (auto item : config_.attrs) {
+    const std::string &name = item.first;
+    if (attr_types.find(name) == attr_types.end()) {
+      LOG(FATAL) << "Operator " << type_ << " do not have attr " << name;
+    }
+
+    const std::string &value_str = item.second;
+    const framework::proto::AttrType &type = attr_types[name];
+    switch (type) {
+      case framework::proto::AttrType::BOOLEAN:
+        break;
+      case framework::proto::AttrType::INT: {
+        int value = StringTo<int>(value_str);
+        op_desc_.SetAttr(name, {value});
+      } break;
+      case framework::proto::AttrType::FLOAT: {
+        float value = StringTo<float>(value_str);
+        op_desc_.SetAttr(name, {value});
+      } break;
+      case framework::proto::AttrType::STRING: {
+        op_desc_.SetAttr(name, {value_str});
+      } break;
+      case framework::proto::AttrType::BOOLEANS:
+      case framework::proto::AttrType::INTS:
+      case framework::proto::AttrType::FLOATS:
+      case framework::proto::AttrType::STRINGS:
+        LOG(FATAL) << "Not supported yet.";
+        break;
+      case framework::proto::AttrType::LONG: {
+        int64_t value = StringTo<int64_t>(value_str);
+        op_desc_.SetAttr(name, value);
+      } break;
+      case framework::proto::AttrType::LONGS:
+      default:
+        PADDLE_THROW("Unsupport attr type %d", type);
+    }
+  }
+}
+
 framework::VarDesc *OpTester::Var(const std::string &name) {
   auto it = vars_.find(name);
   if (it != vars_.end()) {
...
@@ -179,24 +256,41 @@ framework::VarDesc *OpTester::Var(const std::string &name) {
 template <typename T>
 void OpTester::SetupTensor(framework::LoDTensor *tensor,
-                           const std::vector<int64_t> &shape, T lower, T upper) {
+                           const std::vector<int64_t> &shape, T lower, T upper,
+                           const std::string &initializer) {
   static unsigned int seed = 100;
   std::mt19937 rng(seed++);
   std::uniform_real_distribution<double> uniform_dist(0, 1);
 
   T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
-  if (platform::is_cpu_place(place_)) {
-    for (int i = 0; i < tensor->numel(); ++i) {
-      ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
-    }
-  } else {
-    framework::LoDTensor cpu_tensor;
-    T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
-                                            platform::CPUPlace());
+
+  framework::LoDTensor cpu_tensor;
+  T *cpu_ptr = nullptr;
+  if (!platform::is_cpu_place(place_)) {
+    cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
+                                         platform::CPUPlace());
+  } else {
+    cpu_ptr = ptr;
+  }
+
+  if (initializer == "random") {
+    for (int i = 0; i < cpu_tensor.numel(); ++i) {
+      cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+    }
+  } else if (initializer == "natural") {
+    for (int i = 0; i < cpu_tensor.numel(); ++i) {
+      cpu_ptr[i] = lower + i;
+    }
+  } else if (initializer == "zeros") {
+    for (int i = 0; i < cpu_tensor.numel(); ++i) {
+      cpu_ptr[i] = 0;
+    }
+  } else {
+    PADDLE_THROW("Unsupported initializer %s.", initializer.c_str());
+  }
+
+  if (!platform::is_cpu_place(place_)) {
     TensorCopySync(cpu_tensor, place_, tensor);
   }
 }
...
@@ -219,7 +313,7 @@ void OpTester::CreateVariables(framework::Scope *scope) {
     }
   }
 
-  for (auto &item : input_lods_) {
+  for (auto &item : inputs_) {
     // Allocate memory for input tensor
     auto &var_name = item.first;
     VLOG(3) << "Allocate memory for tensor " << var_name;
...
@@ -229,11 +323,23 @@ void OpTester::CreateVariables(framework::Scope *scope) {
     auto *var = scope->Var(var_name);
     auto *tensor = var->GetMutable<framework::LoDTensor>();
+    const auto &data_type = var_desc->GetDataType();
+    if (data_type == framework::proto::VarType::INT32) {
+      SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer);
+    } else if (data_type == framework::proto::VarType::INT64) {
+      SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer);
+    } else if (data_type == framework::proto::VarType::FP32) {
       SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
-                         static_cast<float>(1.0));
+                         static_cast<float>(1.0), item.second.initializer);
+    } else if (data_type == framework::proto::VarType::FP64) {
+      SetupTensor<double>(tensor, shape, static_cast<double>(0.0),
+                          static_cast<double>(1.0), item.second.initializer);
+    } else {
+      PADDLE_THROW("Unsupported dtype %d.", data_type);
+    }
 
     VLOG(3) << "Set lod for tensor " << var_name;
-    std::vector<std::vector<size_t>> &lod_vec = item.second;
+    std::vector<std::vector<size_t>> &lod_vec = item.second.lod;
     framework::LoD lod;
     for (size_t i = 0; i < lod_vec.size(); ++i) {
       lod.push_back(lod_vec[i]);
...
@@ -261,7 +367,16 @@ std::string OpTester::DebugString() {
       ss << GenSpaces(count) << "type: LOD_TENSOR\n";
       ss << GenSpaces(count++) << "lod_tensor {\n";
       ss << GenSpaces(count++) << "tensor {\n";
+      const auto &data_type = var->GetDataType();
+      if (data_type == framework::proto::VarType::INT32) {
+        ss << GenSpaces(count) << "data_type: INT32\n";
+      } else if (data_type == framework::proto::VarType::INT64) {
+        ss << GenSpaces(count) << "data_type: INT64\n";
+      } else if (data_type == framework::proto::VarType::FP32) {
        ss << GenSpaces(count) << "data_type: FP32\n";
+      } else if (data_type == framework::proto::VarType::FP64) {
+        ss << GenSpaces(count) << "data_type: FP64\n";
+      }
       std::vector<int64_t> shape = var->GetShape();
       for (auto d : shape) {
         ss << GenSpaces(count) << "dims: " << d << "\n";
...
@@ -288,6 +403,63 @@ std::string OpTester::DebugString() {
     ss << GenSpaces(--count) << "}\n";
   }
   ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
+  for (auto &name : op_desc_.AttrNames()) {
+    ss << GenSpaces(count++) << "attrs {\n";
+    const auto &attr_type = op_desc_.GetAttrType(name);
+    const auto &attr = op_desc_.GetAttr(name);
+    ss << GenSpaces(count) << "name: \"" << name << "\"\n";
+    switch (attr_type) {
+      case framework::proto::AttrType::BOOLEAN: {
+        ss << GenSpaces(count) << "type: BOOLEAN\n";
+        ss << GenSpaces(count) << "b: " << boost::get<bool>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::INT: {
+        ss << GenSpaces(count) << "type: INT\n";
+        ss << GenSpaces(count) << "i: " << boost::get<int>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::FLOAT: {
+        ss << GenSpaces(count) << "type: FLOAT\n";
+        ss << GenSpaces(count) << "f: " << boost::get<float>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::STRING: {
+        ss << GenSpaces(count) << "type: STRING\n";
+        ss << GenSpaces(count) << "s: \"" << boost::get<std::string>(attr)
+           << "\"\n";
+      } break;
+      case framework::proto::AttrType::BOOLEANS: {
+        ss << GenSpaces(count) << "type: BOOLEANS\n";
+        ss << GenSpaces(count) << "bools: "
+           << "\n";
+      } break;
+      case framework::proto::AttrType::INTS: {
+        ss << GenSpaces(count) << "type: INTS\n";
+        ss << GenSpaces(count) << "ints: "
+           << "\n";
+      } break;
+      case framework::proto::AttrType::FLOATS: {
+        ss << GenSpaces(count) << "type: FLOATS\n";
+        ss << GenSpaces(count) << "floats: "
+           << "\n";
+      } break;
+      case framework::proto::AttrType::STRINGS: {
+        ss << GenSpaces(count) << "type: STRINGS\n";
+        ss << GenSpaces(count) << "strings: "
+           << "\n";
+      } break;
+      case framework::proto::AttrType::LONG: {
+        ss << GenSpaces(count) << "type: LONG\n";
+        ss << GenSpaces(count) << "l: " << boost::get<int64_t>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::LONGS: {
+        ss << GenSpaces(count) << "type: LONGS\n";
+        ss << GenSpaces(count) << "longs: "
+           << "\n";
+      } break;
+      default:
+        PADDLE_THROW("Unsupport attr type %d", attr_type);
+    }
+    ss << GenSpaces(--count) << "}\n";
+  }
   ss << GenSpaces(--count) << "}\n";
   return ss.str();
 }
...
@@ -299,6 +471,7 @@ TEST(op_tester, base) {
                                              FLAGS_op_config_list.c_str());
     std::vector<OpTesterConfig> op_configs;
     while (!fin.eof()) {
+      VLOG(4) << "Reading config " << op_configs.size() << "...";
      OpTesterConfig config;
      bool result = config.Init(fin);
      if (result) {
...
paddle/fluid/operators/benchmark/op_tester.h

@@ -14,7 +14,9 @@ limitations under the License. */
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/op_desc.h"
...
@@ -39,16 +41,21 @@ class OpTester {
  private:
   std::vector<std::string> GetOpProtoInputNames();
   std::vector<std::string> GetOpProtoOutputNames();
+  std::unordered_map<std::string, framework::proto::AttrType>
+  GetOpProtoAttrNames();
+
+  framework::proto::VarType::Type TransToVarType(std::string str);
 
   void CreateInputVarDesc();
   void CreateOutputVarDesc();
+  void CreateOpDesc();
 
   framework::VarDesc *Var(const std::string &name);
   void CreateVariables(framework::Scope *scope);
 
   template <typename T>
   void SetupTensor(framework::LoDTensor *input,
-                   const std::vector<int64_t> &shape, T lower, T upper);
+                   const std::vector<int64_t> &shape, T lower, T upper,
+                   const std::string &initializer);
 
   void RunImpl();
...
@@ -57,7 +64,7 @@ class OpTester {
   std::string type_;
   framework::OpDesc op_desc_;
   std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
-  std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
+  std::unordered_map<std::string, OpInputConfig> inputs_;
   std::unique_ptr<framework::OperatorBase> op_;
   platform::Place place_;
   std::unique_ptr<framework::Scope> scope_;
...
paddle/fluid/operators/benchmark/op_tester_config.cc

@@ -14,7 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/benchmark/op_tester_config.h"
 #include <fstream>
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
...
@@ -40,6 +39,62 @@ static void EraseEndSep(std::string* str,
   }
 }
 
+OpInputConfig::OpInputConfig(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (sep != kEndSeparator) {
+      is >> sep;
+      if (sep == "name" || sep == "name:") {
+        is >> name;
+        EraseEndSep(&name);
+      } else if (sep == "dtype" || sep == "dtype:") {
+        ParseDType(is);
+      } else if (sep == "initializer" || sep == "initializer:") {
+        ParseInitializer(is);
+      } else if (sep == "dims" || sep == "dims:") {
+        ParseDims(is);
+      } else if (sep == "lod" || sep == "lod:") {
+        ParseLoD(is);
+      }
+    }
+  }
+}
+
+void OpInputConfig::ParseDType(std::istream& is) {
+  std::string dtype_str;
+  is >> dtype_str;
+  EraseEndSep(&dtype_str);
+  if (dtype_str == "int32" || dtype_str == "int") {
+    dtype = "int32";
+  } else if (dtype_str == "int64" || dtype_str == "long") {
+    dtype = "int64";
+  } else if (dtype_str == "fp32" || dtype_str == "float") {
+    dtype = "fp32";
+  } else if (dtype_str == "fp64" || dtype_str == "double") {
+    dtype = "fp64";
+  } else {
+    PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str());
+  }
+  VLOG(4) << "dtype of input " << name << " is: " << dtype;
+}
+
+void OpInputConfig::ParseInitializer(std::istream& is) {
+  std::string initializer_str;
+  is >> initializer_str;
+  EraseEndSep(&initializer_str);
+
+  const std::vector<std::string> supported_initializers = {"random", "natural",
+                                                           "zeros"};
+  if (!Has(supported_initializers, initializer_str)) {
+    PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str());
+  }
+
+  initializer = initializer_str;
+  VLOG(4) << "initializer of input " << name << " is: " << initializer;
+}
+
 void OpInputConfig::ParseDims(std::istream& is) {
   std::string dims_str;
   is >> dims_str;
...
@@ -84,7 +139,7 @@ void OpInputConfig::ParseLoD(std::istream& is) {
         number += lod_str[i];
         ++i;
       }
-      level.push_back(atoi(number.c_str()));
+      level.push_back(StringTo<size_t>(number));
     }
     lod.push_back(level);
   } else if (lod_str[i] == '}') {
...
@@ -93,24 +148,6 @@ void OpInputConfig::ParseLoD(std::istream& is) {
   }
 }
 
-OpInputConfig::OpInputConfig(std::istream& is) {
-  std::string sep;
-  is >> sep;
-  if (sep == kStartSeparator) {
-    while (sep != kEndSeparator) {
-      is >> sep;
-      if (sep == "name" || sep == "name:") {
-        is >> name;
-        EraseEndSep(&name);
-      } else if (sep == "dims" || sep == "dims:") {
-        ParseDims(is);
-      } else if (sep == "lod" || sep == "lod:") {
-        ParseLoD(is);
-      }
-    }
-  }
-}
-
 OpTesterConfig::OpTesterConfig(const std::string& filename) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
   PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
...
@@ -167,6 +204,7 @@ bool OpTesterConfig::ParseAttrs(std::istream& is) {
   is >> value;
   EraseEndSep(&key, ":");
   EraseEndSep(&value);
+  VLOG(4) << "attrs: " << key << ", " << value;
 
   attrs[key] = value;
 }
...
paddle/fluid/operators/benchmark/op_tester_config.h

@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <istream>
+#include <sstream>
 #include <string>
 #include <unordered_map>
 #include <vector>
...
@@ -27,10 +28,14 @@ struct OpInputConfig {
   OpInputConfig() {}
   explicit OpInputConfig(std::istream& is);
 
+  void ParseDType(std::istream& is);
+  void ParseInitializer(std::istream& is);
   void ParseDims(std::istream& is);
   void ParseLoD(std::istream& is);
 
   std::string name;
+  std::string dtype{"fp32"};  // int32/int, int64/long, fp32/float, fp64/double
+  std::string initializer{"random"};  // random, natural
   std::vector<int64_t> dims;
   std::vector<std::vector<size_t>> lod;
 };
...
@@ -55,6 +60,23 @@ struct OpTesterConfig {
   double runtime{0.0};
 };
 
+static bool Has(const std::vector<std::string>& vec, const std::string& item) {
+  for (size_t i = 0; i < vec.size(); ++i) {
+    if (vec[i] == item) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename T>
+T StringTo(const std::string& str) {
+  std::istringstream is(str);
+  T value;
+  is >> value;
+  return value;
+}
+
 }  // namespace benchmark
 }  // namespace operators
 }  // namespace paddle
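A short usage sketch for the two helpers added above; the values are illustrative only.

// Has() does a linear membership test; StringTo<T>() parses via std::istringstream.
const std::vector<std::string> supported = {"random", "natural", "zeros"};
bool known = Has(supported, "zeros");     // true
int repeat = StringTo<int>("16");         // 16
double ratio = StringTo<double>("0.5");   // 0.5
size_t level = StringTo<size_t>("128");   // the same helper ParseLoD now uses instead of atoi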
paddle/fluid/operators/cast_op.cc

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cast_op.h"
+#include <memory>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
...
@@ -30,7 +31,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 Cast Operator.
 
 This Operator casts the input tensor to another data type and
-returns tha Output Tensor.
+returns the Output Tensor. It's meaningless if the output dtype equals
+the input dtype, but it's fine if you do so.
 
 )DOC");
   }
...
paddle/fluid/operators/detection/CMakeLists.txt

@@ -33,11 +33,14 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
+detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
 
 if(WITH_GPU)
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
+  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub)
 else()
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
+  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
 endif()
 
 detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
...
paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
namespace paddle {
namespace operators {

using LoDTensor = framework::LoDTensor;

class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(
        ctx->HasInput("PriorBox"),
        "Input(PriorBox) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("PriorBoxVar"),
        "Input(PriorBoxVar) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("TargetBox"),
        "Input(TargetBox) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("BoxScore"),
        "Input(BoxScore) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("DecodeBox"),
        "Output(DecodeBox) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("OutputAssignBox"),
        "Output(OutputAssignBox) of BoxDecoderAndAssignOp should not be null.");

    auto prior_box_dims = ctx->GetInputDim("PriorBox");
    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
    auto target_box_dims = ctx->GetInputDim("TargetBox");
    auto box_score_dims = ctx->GetInputDim("BoxScore");

    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
                      "The rank of Input of PriorBox must be 2");
    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
    PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 1,
                      "The rank of Input of PriorBoxVar must be 1");
    PADDLE_ENFORCE_EQ(prior_box_var_dims[0], 4,
                      "The shape of PriorBoxVar is [4]");
    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
                      "The rank of Input of TargetBox must be 2");
    PADDLE_ENFORCE_EQ(box_score_dims.size(), 2,
                      "The rank of Input of BoxScore must be 2");
    PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0],
                      "The first dim of prior_box and target_box is roi nums "
                      "and should be same!");
    PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0],
                      "The first dim of prior_box and box_score is roi nums "
                      "and should be same!");
    PADDLE_ENFORCE_EQ(
        target_box_dims[1], box_score_dims[1] * prior_box_dims[1],
        "The shape of target_box is [N, classnum * 4], The shape "
        "of box_score is [N, classnum], The shape of prior_box "
        "is [N, 4]");

    ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0],
                                                         target_box_dims[1]}));
    ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox");
    ctx->SetOutputDim(
        "OutputAssignBox",
        framework::make_ddim({prior_box_dims[0], prior_box_dims[1]}));
    ctx->ShareLoD("PriorBox", /*->*/ "OutputAssignBox");
  }
};

class BoxDecoderAndAssignOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput(
        "PriorBox",
        "(Tensor, default Tensor<float>) "
        "Box list PriorBox is a 2-D Tensor with shape [N, 4] which holds N "
        "boxes and each box is represented as [xmin, ymin, xmax, ymax], "
        "[xmin, ymin] is the left top coordinate of the anchor box, "
        "if the input is image feature map, they are close to the origin "
        "of the coordinate system. [xmax, ymax] is the right bottom "
        "coordinate of the anchor box.");
    AddInput("PriorBoxVar",
             "(Tensor, default Tensor<float>, optional) "
             "PriorBoxVar is a 2-D Tensor with shape [N, 4] which holds N "
             "group of variance. PriorBoxVar will set all elements to 1 by "
             "default.")
        .AsDispensable();
    AddInput("TargetBox",
             "(LoDTensor or Tensor) "
             "This input can be a 2-D LoDTensor with shape "
             "[N, classnum*4]. It holds N targets for N boxes.");
    AddInput("BoxScore",
             "(LoDTensor or Tensor) "
             "This input can be a 2-D LoDTensor with shape "
             "[N, classnum], each box is represented as [classnum] which is "
             "the classification probabilities.");
    AddAttr<float>("box_clip",
                   "(float, default 4.135, np.log(1000. / 16.)) "
                   "clip box to prevent overflowing")
        .SetDefault(4.135f);
    AddOutput("DecodeBox",
              "(LoDTensor or Tensor) "
              "the output tensor of op with shape [N, classnum * 4] "
              "representing the result of N target boxes decoded with "
              "M Prior boxes and variances for each class.");
    AddOutput("OutputAssignBox",
              "(LoDTensor or Tensor) "
              "the output tensor of op with shape [N, 4] "
              "representing the result of N target boxes decoded with "
              "M Prior boxes and variances with the best non-background class "
              "by BoxScore.");
    AddComment(R"DOC(

Bounding Box Coder.

Decode the target bounding box with the prior_box information.

The Decoding schema is described below:

    $$
    ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2}
    $$
    $$
    oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2}
    $$
    $$
    ow = \\exp (pwv \\times tw) \\times pw + \\frac{tw}{2}
    $$
    $$
    oh = \\exp (phv \\times th) \\times ph + \\frac{th}{2}
    $$

where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
`phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the
decoded coordinates, width and height in decode_box.

decode_box is obtained after box decode, then assigning schema is described below:

For each prior_box, use the best non-background class's decoded values to
update the prior_box locations and get output_assign_box. So, the shape of
output_assign_box is the same as PriorBox.
)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(box_decoder_and_assign, ops::BoxDecoderAndAssignOp,
                  ops::BoxDecoderAndAssignOpMaker,
                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
    box_decoder_and_assign,
    ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, float>,
    ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, double>);
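For reviewers, here is a minimal standalone sketch (written for this review, not part of the patch) of the per-box arithmetic that the DOC block above describes and that both the CPU and CUDA kernels below implement. Prior boxes are [xmin, ymin, xmax, ymax], the variance has four entries, and dw/dh are clipped to box_clip (default 4.135, roughly log(1000/16)) so exp() cannot overflow.

#include <algorithm>
#include <cmath>

// Decode one target box against one prior box, mirroring DecodeBoxKernel.
void DecodeOneBox(const float prior[4], const float var[4],
                  const float target[4], float box_clip, float out[4]) {
  float pw = prior[2] - prior[0] + 1.f;          // prior width
  float ph = prior[3] - prior[1] + 1.f;          // prior height
  float px = prior[0] + pw / 2.f;                // prior center x
  float py = prior[1] + ph / 2.f;                // prior center y
  float dw = std::min(var[2] * target[2], box_clip);
  float dh = std::min(var[3] * target[3], box_clip);
  float ox = var[0] * target[0] * pw + px;
  float oy = var[1] * target[1] * ph + py;
  float ow = std::exp(dw) * pw;
  float oh = std::exp(dh) * ph;
  out[0] = ox - ow / 2.f;
  out[1] = oy - oh / 2.f;
  out[2] = ox + ow / 2.f - 1.f;
  out[3] = oy + oh / 2.f - 1.f;
}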
paddle/fluid/operators/detection/box_decoder_and_assign_op.cu
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {

template <typename T>
__global__ void DecodeBoxKernel(const T* prior_box_data,
                                const T* prior_box_var_data,
                                const T* target_box_data, const int roi_num,
                                const int class_num, const T box_clip,
                                T* output_box_data) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < roi_num * class_num) {
    int i = idx / class_num;
    int j = idx % class_num;
    T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
    T prior_box_height =
        prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
    T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
    T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;

    int offset = i * class_num * 4 + j * 4;
    T dw = prior_box_var_data[2] * target_box_data[offset + 2];
    T dh = prior_box_var_data[3] * target_box_data[offset + 3];
    if (dw > box_clip) {
      dw = box_clip;
    }
    if (dh > box_clip) {
      dh = box_clip;
    }
    T target_box_center_x = 0, target_box_center_y = 0;
    T target_box_width = 0, target_box_height = 0;
    target_box_center_x =
        prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
        prior_box_center_x;
    target_box_center_y =
        prior_box_var_data[1] * target_box_data[offset + 1] * prior_box_height +
        prior_box_center_y;
    target_box_width = expf(dw) * prior_box_width;
    target_box_height = expf(dh) * prior_box_height;

    output_box_data[offset] = target_box_center_x - target_box_width / 2;
    output_box_data[offset + 1] = target_box_center_y - target_box_height / 2;
    output_box_data[offset + 2] =
        target_box_center_x + target_box_width / 2 - 1;
    output_box_data[offset + 3] =
        target_box_center_y + target_box_height / 2 - 1;
  }
}

template <typename T>
__global__ void AssignBoxKernel(const T* prior_box_data,
                                const T* box_score_data, T* output_box_data,
                                const int roi_num, const int class_num,
                                T* output_assign_box_data) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < roi_num) {
    int i = idx;
    T max_score = -1;
    int max_j = -1;
    for (int j = 0; j < class_num; ++j) {
      T score = box_score_data[i * class_num + j];
      if (score > max_score && j > 0) {
        max_score = score;
        max_j = j;
      }
    }
    if (max_j > 0) {
      for (int pno = 0; pno < 4; pno++) {
        output_assign_box_data[i * 4 + pno] =
            output_box_data[i * class_num * 4 + max_j * 4 + pno];
      }
    } else {
      for (int pno = 0; pno < 4; pno++) {
        output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
      }
    }
  }
}

template <typename DeviceContext, typename T>
class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
                   "This kernel only runs on GPU device.");
    auto* prior_box = context.Input<framework::LoDTensor>("PriorBox");
    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
    auto* box_score = context.Input<framework::LoDTensor>("BoxScore");
    auto* output_box = context.Output<framework::Tensor>("DecodeBox");
    auto* output_assign_box =
        context.Output<framework::Tensor>("OutputAssignBox");

    auto roi_num = target_box->dims()[0];
    auto class_num = box_score->dims()[1];
    auto* target_box_data = target_box->data<T>();
    auto* prior_box_data = prior_box->data<T>();
    auto* prior_box_var_data = prior_box_var->data<T>();
    auto* box_score_data = box_score->data<T>();
    output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
    output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
    T* output_box_data = output_box->data<T>();
    T* output_assign_box_data = output_assign_box->data<T>();

    int block = 512;
    int grid = (roi_num * class_num + block - 1) / block;
    auto& device_ctx = context.cuda_device_context();

    const T box_clip = context.Attr<T>("box_clip");

    DecodeBoxKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
        prior_box_data, prior_box_var_data, target_box_data, roi_num,
        class_num, box_clip, output_box_data);

    context.device_context().Wait();
    int assign_grid = (roi_num + block - 1) / block;
    AssignBoxKernel<T><<<assign_grid, block, 0, device_ctx.stream()>>>(
        prior_box_data, box_score_data, output_box_data, roi_num, class_num,
        output_assign_box_data);
    context.device_context().Wait();
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    box_decoder_and_assign,
    ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
                                       float>,
    ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
                                       double>);
paddle/fluid/operators/detection/box_decoder_and_assign_op.h
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class BoxDecoderAndAssignKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* prior_box = context.Input<framework::LoDTensor>("PriorBox");
    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
    auto* box_score = context.Input<framework::LoDTensor>("BoxScore");
    auto* output_box = context.Output<framework::Tensor>("DecodeBox");
    auto* output_assign_box =
        context.Output<framework::Tensor>("OutputAssignBox");
    int roi_num = target_box->dims()[0];
    int class_num = box_score->dims()[1];
    auto* target_box_data = target_box->data<T>();
    auto* prior_box_data = prior_box->data<T>();
    auto* prior_box_var_data = prior_box_var->data<T>();
    auto* box_score_data = box_score->data<T>();
    output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
    output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
    T* output_box_data = output_box->data<T>();
    T* output_assign_box_data = output_assign_box->data<T>();
    const T bbox_clip = context.Attr<T>("box_clip");

    for (int i = 0; i < roi_num; ++i) {
      T prior_box_width =
          prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
      T prior_box_height =
          prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
      T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
      T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;
      for (int j = 0; j < class_num; ++j) {
        int64_t offset = i * class_num * 4 + j * 4;
        T dw = std::min(prior_box_var_data[2] * target_box_data[offset + 2],
                        bbox_clip);
        T dh = std::min(prior_box_var_data[3] * target_box_data[offset + 3],
                        bbox_clip);
        T target_box_center_x = 0, target_box_center_y = 0;
        T target_box_width = 0, target_box_height = 0;
        target_box_center_x =
            prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
            prior_box_center_x;
        target_box_center_y = prior_box_var_data[1] *
                                  target_box_data[offset + 1] *
                                  prior_box_height +
                              prior_box_center_y;
        target_box_width = std::exp(dw) * prior_box_width;
        target_box_height = std::exp(dh) * prior_box_height;

        output_box_data[offset] = target_box_center_x - target_box_width / 2;
        output_box_data[offset + 1] =
            target_box_center_y - target_box_height / 2;
        output_box_data[offset + 2] =
            target_box_center_x + target_box_width / 2 - 1;
        output_box_data[offset + 3] =
            target_box_center_y + target_box_height / 2 - 1;
      }

      T max_score = -1;
      int max_j = -1;
      for (int j = 0; j < class_num; ++j) {
        T score = box_score_data[i * class_num + j];
        if (score > max_score && j > 0) {
          max_score = score;
          max_j = j;
        }
      }

      if (max_j > 0) {
        for (int pno = 0; pno < 4; pno++) {
          output_assign_box_data[i * 4 + pno] =
              output_box_data[i * class_num * 4 + max_j * 4 + pno];
        }
      } else {
        for (int pno = 0; pno < 4; pno++) {
          output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
        }
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
namespace paddle {
namespace operators {

class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("FpnRois"),
                   "Input(FpnRois) shouldn't be null");
    PADDLE_ENFORCE_GE(
        ctx->Outputs("MultiFpnRois").size(), 1UL,
        "Outputs(MultiFpnRois) of DistributeOp should not be empty");
    size_t min_level = static_cast<size_t>(ctx->Attrs().Get<int>("min_level"));
    size_t max_level = static_cast<size_t>(ctx->Attrs().Get<int>("max_level"));
    PADDLE_ENFORCE_GE(max_level, min_level,
                      "max_level must not lower than min_level");
    // Set the output shape
    size_t num_out_rois = max_level - min_level + 1;
    std::vector<framework::DDim> outs_dims;
    outs_dims.reserve(num_out_rois);
    for (size_t i = 0; i < num_out_rois; ++i) {
      framework::DDim out_dim = {-1, 4};
      outs_dims.push_back(out_dim);
    }
    ctx->SetOutputsDim("MultiFpnRois", outs_dims);
    ctx->SetOutputDim("RestoreIndex", {1, -1});
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois"));
    return framework::OpKernelType(data_type, platform::CPUPlace());
  }
};

class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)");
    AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator")
        .AsDuplicable();
    AddOutput("RestoreIndex",
              "(Tensor) An array of positive number which is "
              "used to restore the order of FpnRois");
    AddAttr<int>("min_level",
                 "The lowest level of FPN layer where the"
                 " proposals come from");
    AddAttr<int>("max_level",
                 "The highest level of FPN layer where the"
                 " proposals come from");
    AddAttr<int>("refer_level",
                 "The referring level of FPN layer with"
                 " specified scale");
    AddAttr<int>("refer_scale",
                 "The referring scale of FPN layer with"
                 " specified level");
    AddComment(R"DOC(
This operator distribute all proposals into different fpn level,
with respect to scale of the proposals, the referring scale and
the referring level. Besides, to restore the order of proposals,
we return an array which indicate the original index of rois in
current proposals.
)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp,
                  ops::DistributeFpnProposalsOpMaker,
                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals,
                       ops::DistributeFpnProposalsOpKernel<float>,
                       ops::DistributeFpnProposalsOpKernel<double>);
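A small standalone sketch of the level-assignment rule this operator applies (written for this review, not part of the patch): every RoI is routed to floor(log2(sqrt(area) / refer_scale)) + refer_level, clipped to [min_level, max_level], which is exactly what both the CPU and GPU kernels below compute per box.

#include <algorithm>
#include <cmath>

// roi is [xmin, ymin, xmax, ymax]; non-normalized boxes use the
// (w + 1) * (h + 1) area, matching BBoxArea()/RoIArea() below.
int TargetFpnLevel(const float roi[4], int min_level, int max_level,
                   int refer_level, int refer_scale) {
  float w = roi[2] - roi[0] + 1.f;
  float h = roi[3] - roi[1] + 1.f;
  float scale = std::sqrt(w * h);
  int lvl = static_cast<int>(
      std::floor(std::log2(scale / refer_scale) + refer_level));
  return std::min(max_level, std::max(lvl, min_level));
}
// For example, with min_level=2, max_level=5, refer_level=4, refer_scale=224,
// a roughly 112x112 RoI lands on level 3.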
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/memory/allocation/allocator.h>
#include "cub/cub.cuh"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;

#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

int const BBoxSize = 4;

struct RangeInitFunctor {
  int start_;
  int delta_;
  int* out_;
  __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
};

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}

static inline void TransLoD(const int* length_lod, const int lod_size,
                            int* offset_lod) {
  int offset = 0;
  for (int i = 0; i < lod_size; ++i) {
    offset_lod[i] = offset;
    offset += length_lod[i];
  }
}

template <typename T>
static __device__ inline T RoIArea(const T* box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are is invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
    return static_cast<T>(0.);
  } else {
    const T w = box[2] - box[0];
    const T h = box[3] - box[1];
    if (normalized) {
      return w * h;
    } else {
      // If coordinate values are not within range [0, 1].
      return (w + 1) * (h + 1);
    }
  }
}

template <class T>
static __global__ void GPUDistFpnProposalsHelper(
    const int nthreads, const T* rois, const int lod_size,
    const int refer_level, const int refer_scale, const int max_level,
    const int min_level, int* roi_batch_id_data, int* sub_lod_list,
    int* target_lvls) {
  CUDA_1D_KERNEL_LOOP(i, nthreads) {
    const T* offset_roi = rois + i * BBoxSize;
    int roi_batch_ind = roi_batch_id_data[i];
    // get the target level of current rois
    T roi_area = RoIArea(offset_roi, false);
    T roi_scale = sqrt(roi_area);
    int tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level);
    tgt_lvl = min(max_level, max(tgt_lvl, min_level));
    target_lvls[i] = tgt_lvl;
    // compute number of rois in the same batch and same target level
    platform::CudaAtomicAdd(sub_lod_list + tgt_lvl * lod_size + roi_batch_ind,
                            1);
  }
}

template <typename DeviceContext, typename T>
class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* fpn_rois = ctx.Input<paddle::framework::LoDTensor>("FpnRois");

    auto multi_fpn_rois = ctx.MultiOutput<LoDTensor>("MultiFpnRois");
    auto* restore_index = ctx.Output<Tensor>("RestoreIndex");

    const int min_level = ctx.Attr<int>("min_level");
    const int max_level = ctx.Attr<int>("max_level");
    const int refer_level = ctx.Attr<int>("refer_level");
    const int refer_scale = ctx.Attr<int>("refer_scale");
    int num_level = max_level - min_level + 1;

    // check that the fpn_rois is not empty
    PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
                      "DistributeFpnProposalsOp need 1 level of LoD");

    auto fpn_rois_lod = fpn_rois->lod().back();
    int lod_size = fpn_rois_lod.size() - 1;
    int roi_num = fpn_rois_lod[lod_size];

    auto& dev_ctx = ctx.template device_context<DeviceContext>();

    // get batch id by lod in CPU
    Tensor roi_batch_id_list;
    roi_batch_id_list.Resize({roi_num});
    int* roi_batch_id_data =
        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
    for (int n = 0; n < lod_size; ++n) {
      for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) {
        roi_batch_id_data[i] = n;
      }
    }
    // copy batch id list to GPU
    Tensor roi_batch_id_list_gpu;
    framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(),
                              &roi_batch_id_list_gpu);

    Tensor sub_lod_list;
    sub_lod_list.Resize({num_level, lod_size});
    int* sub_lod_list_data = sub_lod_list.mutable_data<int>(dev_ctx.GetPlace());
    Tensor target_lvls;
    target_lvls.Resize({roi_num});
    int* target_lvls_data = target_lvls.mutable_data<int>(dev_ctx.GetPlace());

    int blocks = NumBlocks(roi_num);
    int threads = kNumCUDAThreads;

    // get target levels and sub_lod list
    GPUDistFpnProposalsHelper<T><<<blocks, threads>>>(
        roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
        max_level, min_level, roi_batch_id_list_gpu.data<int>(),
        sub_lod_list_data, target_lvls_data);

    Tensor index_in_t;
    int* idx_in = index_in_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
    platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, roi_num);
    for_range(RangeInitFunctor{0, 1, idx_in});

    Tensor keys_out_t;
    int* keys_out = keys_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
    Tensor index_out_t;
    int* idx_out = index_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());

    // Determine temporary device storage requirements
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairsDescending<int, int>(
        nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in,
        idx_out, roi_num);
    // Allocate temporary storage
    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
    auto d_temp_storage = memory::Alloc(place, temp_storage_bytes,
                                        memory::Allocator::kScratchpad);

    // Run sorting operation
    // sort target level to get corresponding index
    cub::DeviceRadixSort::SortPairsDescending<int, int>(
        d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
        idx_in, idx_out, roi_num);

    int* restore_idx_data =
        restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
    // sort current index to get restore index
    cub::DeviceRadixSort::SortPairsDescending<int, int>(
        d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
        restore_idx_data, roi_num);

    Tensor offset_lod;
    int* offset_lod_data =
        offset_lod.mutable_data<int>({lod_size + 1}, dev_ctx.GetPlace());
    for (int i = 0; i < num_level; ++i) {
      Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
      int* sub_lod_data = sub_lod.data<int>();
      // transfer length-based lod to offset-based lod
      TransLoD(sub_lod_data, lod_size + 1, offset_lod_data);
      int sub_rois_num = offset_lod_data[lod_size];
      Tensor sub_idx = index_out_t.Slice(0, sub_rois_num);

      multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
                                         dev_ctx.GetPlace());

      GPUGather<T>(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]);
      framework::LoD lod;
      std::vector<size_t> offset;
      memory::Copy(platform::CPUPlace(), offset.data(), place, offset_lod_data,
                   sizeof(int) * (lod_size + 1), 0);
      lod.emplace_back(offset);
      multi_fpn_rois[i]->set_lod(lod);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    distribute_fpn_proposals,
    ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
                                           float>,
    ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
                                           double>);
paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {

const int kBoxDim = 4;

template <typename T>
static inline T BBoxArea(const T* box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are is invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
    return static_cast<T>(0.);
  } else {
    const T w = box[2] - box[0];
    const T h = box[3] - box[1];
    if (normalized) {
      return w * h;
    } else {
      // If coordinate values are not within range [0, 1].
      return (w + 1) * (h + 1);
    }
  }
}

template <typename T>
class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* fpn_rois = context.Input<paddle::framework::LoDTensor>("FpnRois");

    auto multi_fpn_rois =
        context.MultiOutput<paddle::framework::LoDTensor>("MultiFpnRois");

    auto* restore_index =
        context.Output<paddle::framework::Tensor>("RestoreIndex");

    const int min_level = context.Attr<int>("min_level");
    const int max_level = context.Attr<int>("max_level");
    const int refer_level = context.Attr<int>("refer_level");
    const int refer_scale = context.Attr<int>("refer_scale");
    const int num_level = max_level - min_level + 1;

    // check that the fpn_rois is not empty
    PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
                      "DistributeFpnProposalsOp need 1 level of LoD");

    auto fpn_rois_lod = fpn_rois->lod().back();
    int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
    std::vector<int> target_level;
    // std::vector<int> target_level(fpn_rois_num, -1);
    // record the number of rois in each level
    std::vector<int> num_rois_level(num_level, 0);
    std::vector<int> num_rois_level_integral(num_level + 1, 0);
    for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
      Tensor fpn_rois_slice =
          fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
      const T* rois_data = fpn_rois_slice.data<T>();
      for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
        // get the target level of current rois
        T roi_scale = std::sqrt(BBoxArea(rois_data, false));
        int tgt_lvl =
            std::floor(std::log2(roi_scale / refer_scale) + refer_level);
        tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
        target_level.push_back(tgt_lvl);
        num_rois_level[tgt_lvl - min_level]++;
        rois_data += kBoxDim;
      }
    }
    // define the output rois
    // pointer which point to each level fpn rois
    std::vector<T*> multi_fpn_rois_data(num_level);
    // lod0 which will record the offset information of each level rois
    std::vector<std::vector<size_t>> multi_fpn_rois_lod0;
    for (int i = 0; i < num_level; ++i) {
      // allocate memory for each level rois
      multi_fpn_rois[i]->mutable_data<T>({num_rois_level[i], kBoxDim},
                                         context.GetPlace());
      multi_fpn_rois_data[i] = multi_fpn_rois[i]->data<T>();
      std::vector<size_t> lod0(1, 0);
      multi_fpn_rois_lod0.push_back(lod0);
      // statistic start point for each level rois
      num_rois_level_integral[i + 1] =
          num_rois_level_integral[i] + num_rois_level[i];
    }
    restore_index->mutable_data<int>({1, fpn_rois_num}, context.GetPlace());
    int* restore_index_data = restore_index->data<int>();
    std::vector<int> restore_index_inter(fpn_rois_num, -1);
    // distribute the rois into different fpn level by target level
    for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
      Tensor fpn_rois_slice =
          fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
      const T* rois_data = fpn_rois_slice.data<T>();
      size_t cur_offset = fpn_rois_lod[i];
      // std::vector<size_t > lod_offset[num_level];
      for (int j = 0; j < num_level; j++) {
        multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]);
      }
      for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
        int lvl = target_level[cur_offset + j];
        memcpy(multi_fpn_rois_data[lvl - min_level], rois_data,
               kBoxDim * sizeof(T));
        multi_fpn_rois_data[lvl - min_level] += kBoxDim;
        int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
                               multi_fpn_rois_lod0[lvl - min_level][i + 1];
        restore_index_inter[index_in_shuffle] = cur_offset + j;
        multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
        rois_data += kBoxDim;
      }
    }
    for (int i = 0; i < fpn_rois_num; ++i) {
      restore_index_data[restore_index_inter[i]] = i;
    }
    // merge lod information into LoDTensor
    for (int i = 0; i < num_level; ++i) {
      framework::LoD lod;
      lod.emplace_back(multi_fpn_rois_lod0[i]);
      multi_fpn_rois[i]->set_lod(lod);
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
...
...
@@ -22,7 +22,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
...
...
@@ -47,7 +46,7 @@ struct EmbeddingVSumFunctor {
    auto* output = output_t->mutable_data<T>(context.GetPlace());
    PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL);
    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL, "The LoD[0] could NOT be empty");

    jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
                                  out_width, jit::SeqPoolType::kSum);
...
...
@@ -83,11 +82,11 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
        FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims());
    const auto& ids_lod = ids_t->lod();
    // in run time, the LoD of ids must be 1
    PADDLE_ENFORCE(ids_lod.size(), 1u,
                   "The LoD level of Input(Ids) must be 1");
    PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
    PADDLE_ENFORCE(ids_lod.size(), 1UL,
                   "The LoD level of Input(Ids) must be 1");
    int64_t batch_size = ids_lod[0].size() - 1;
    // in run time, the shape from Ids -> output
    // should be [seq_length, 1] -> [batch_size, embedding_size]
    // should be [seq_length, 1] -> [batch_size, last_dim]
    output_t->Resize({batch_size, last_dim});

    if (combiner_type == "sum") {
...
...
@@ -125,7 +124,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
      auto* ids_data = ids->data<int64_t>();
      int64_t ids_num = ids->numel();
      auto lod = ids->lod()[0];
      int64_t row_width = d_output->dims()[1];
      int64_t out_width = d_output->dims()[1];

      framework::Vector<int64_t>* new_rows = d_table->mutable_rows();
      new_rows->resize(ids_num);
...
@@ -136,15 +135,13 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
T
*
d_table_data
=
d_table_value
->
mutable_data
<
T
>
(
context
.
GetPlace
());
const
T
*
d_output_data
=
d_output
->
data
<
T
>
();
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
context
);
auto
vbroadcast
=
jit
::
Get
<
jit
::
kVBroadcast
,
jit
::
VBroadcastTuples
<
T
>
,
platform
::
CPUPlace
>
(
out_width
);
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod
.
size
())
-
1
;
++
i
)
{
int64_t
h
=
static_cast
<
int64_t
>
(
lod
[
i
+
1
]
-
lod
[
i
]);
int64_t
in_offset
=
lod
[
i
]
*
row_width
;
const
T
*
out_pos
=
d_output_data
+
i
*
row_width
;
T
*
in_pos
=
d_table_data
+
in_offset
;
for
(
int
r
=
0
;
r
!=
h
;
++
r
)
{
blas
.
VCOPY
(
row_width
,
out_pos
,
in_pos
+
r
*
row_width
);
}
const
T
*
src
=
d_output_data
+
i
*
out_width
;
T
*
dst
=
d_table_data
+
lod
[
i
]
*
out_width
;
vbroadcast
(
src
,
dst
,
h
,
out_width
);
}
}
else
{
LOG
(
ERROR
)
<<
"Dense is not supported in fused_embedding_seq_pool_op now"
;
...
...
paddle/fluid/operators/jit/benchmark.cc
...
...
@@ -474,6 +474,23 @@ void BenchCRFDecodingKernel() {
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void BenchVBroadcastKernel() {
  for (int64_t w : {1, 16, 64, 100, 256}) {
    Tensor x;
    x.Resize({w});
    RandomVec<T>(w, x.mutable_data<T>(PlaceType()));
    const T* x_data = x.data<T>();
    for (int h : TestSizes()) {
      Tensor y;
      y.Resize({h * w});
      T* y_data = y.mutable_data<T>(PlaceType());
      BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
          w, x_data, y_data, static_cast<int64_t>(h), w);
    }
  }
}

using T = float;
using CPUPlace = paddle::platform::CPUPlace;
...
...
@@ -498,6 +515,7 @@ BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
BENCH_FP32_CPU(kVCopy) { BenchXYNKernel<jit::kVCopy, T, CPUPlace>(); }

// lstm and peephole
BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
...
...
@@ -535,6 +553,11 @@ BENCH_FP32_CPU(kCRFDecoding) {
  BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
}

// vbroadcast function
BENCH_FP32_CPU(kVBroadcast) {
  BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>();
}
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
// Options:
...
...
paddle/fluid/operators/jit/gen/CMakeLists.txt
...
...
@@ -33,3 +33,4 @@ USE_JITKERNEL_GEN(kHMax)
USE_JITKERNEL_GEN(kHSum)
USE_JITKERNEL_GEN(kEmbSeqPool)
USE_JITKERNEL_GEN(kSgd)
USE_JITKERNEL_GEN(kVBroadcast)
paddle/fluid/operators/jit/gen/vbroadcast.cc
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
#include <memory>
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {

void VBroadcastJitCode::genCode() {
  preCode();
  constexpr int block = YMM_FLOAT_BLOCK;
  constexpr int max_num_regs = 16;
  const int num_block = w_ / block;
  const int num_groups = num_block / max_num_regs;
  const size_t block_size = sizeof(float) * block;
  std::vector<int> groups(num_groups, max_num_regs);
  int rest_num_regs = num_block % max_num_regs;
  if (rest_num_regs > 0) {
    groups.push_back(rest_num_regs);
  }

  // protect param_h
  mov(reg_height, param_h);
  Label l_next_h;
  xor_(reg_h_i, reg_h_i);
  mov(reg_ptr_dst_i, param_dst);
  L(l_next_h);
  {
    mov(reg_ptr_src_i, param_src);
    for (int num_regs : groups) {
      size_t w_offset = 0;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]);
        w_offset += block_size;
      }
      add(reg_ptr_src_i, num_regs * block_size);

      w_offset = 0;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i));
        w_offset += block_size;
      }
      add(reg_ptr_dst_i, num_regs * block_size);
    }  // end of groups
    inc(reg_h_i);
    cmp(reg_h_i, reg_height);
    jl(l_next_h, T_NEAR);
  }  // end of l_next_h

  postCode();
}

class VBroadcastCreator : public JitCodeCreator<int64_t> {
 public:
  bool UseMe(const int64_t& w) const override {
    return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
  }
  size_t CodeSize(const int64_t& w) const override {
    return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
  }
  std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
    PADDLE_ENFORCE_GT(w, 0);
    return make_unique<VBroadcastJitCode>(w, CodeSize(w));
  }
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle

namespace gen = paddle::operators::jit::gen;

REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator);
paddle/fluid/operators/jit/gen/vbroadcast.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {

class VBroadcastJitCode : public JitCode {
 public:
  explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024,
                             void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), w_(w) {
    this->genCode();
  }

  DECLARE_JIT_CODE(VBroadcastJitCode);
  void genCode() override;

 private:
  int w_;
  reg64_t param_src{abi_param1};
  reg64_t param_dst{abi_param2};
  reg64_t param_h{abi_param3};
  reg64_t param_w{abi_param4};

  reg64_t reg_height{r9};
  reg64_t reg_h_i{r10};
  reg64_t reg_ptr_src_i{r11};
  reg64_t reg_ptr_dst_i{r12};
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/jit/helper.cc
...
...
@@ -36,6 +36,8 @@ const char* to_string(KernelType kt) {
    ONE_CASE(kVScal);
    ONE_CASE(kVAddBias);
    ONE_CASE(kVRelu);
    ONE_CASE(kVBroadcast);
    ONE_CASE(kVCopy);
    ONE_CASE(kVIdentity);
    ONE_CASE(kVExp);
    ONE_CASE(kVSquare);
...
...
paddle/fluid/operators/jit/kernel_base.h
...
...
@@ -41,6 +41,8 @@ typedef enum {
  kVAdd,
  kVAddBias,
  kVAddRelu,
  kVBroadcast,
  kVCopy,
  kVExp,
  kVIdentity,
  kVMul,
...
@@ -133,6 +135,13 @@ struct GRUTuples {
typedef
void
(
*
func_type
)(
gru_t
*
,
const
gru_attr_t
*
);
};
template
<
typename
T
>
struct
VBroadcastTuples
{
typedef
T
data_type
;
typedef
int64_t
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
T
*
,
int64_t
,
int64_t
);
};
typedef
struct
seq_pool_attr_s
{
int
h
,
w
;
// h should always be the first one
SeqPoolType
type
;
...
...
paddle/fluid/operators/jit/kernel_key.cc
...
...
@@ -24,6 +24,11 @@ size_t JitCodeKey<int>(const int& d) {
  return d;
}

template <>
size_t JitCodeKey<int64_t>(const int64_t& d) {
  return d;
}

// TODO(TJ): refine and benchmark JitCodeKey generatation
constexpr int act_type_shift = 3;  // suppot 2^3 act types
static inline int act_type_convert(KernelType type) {
...
...
paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
...
...
@@ -9,9 +9,11 @@ USE_JITKERNEL_MORE(kVAdd, mkl)
USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSquare, mkl)
USE_JITKERNEL_MORE(kVCopy, mkl)
USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl)
USE_JITKERNEL_MORE(kSoftmax, mkl)
USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
USE_JITKERNEL_MORE(kSgd, mkl)
USE_JITKERNEL_MORE(kVBroadcast, mkl)
paddle/fluid/operators/jit/more/mkl/mkl.cc
...
...
@@ -154,6 +154,21 @@ bool VSquareKernel<float>::UseMe(const int& d) const {
  return d > 7;
}

template <>
bool VCopyKernel<float>::UseMe(const int& d) const {
  return d > 15;
}

template <>
bool VBroadcastKernel<float>::UseMe(const int64_t& d) const {
  return d > 127;
}

template <>
bool VBroadcastKernel<double>::UseMe(const int64_t& attr) const {
  return true;
}

template <>
bool VSigmoidKernel<float>::UseMe(const int& d) const {
  return d > 7;
...
...
@@ -223,6 +238,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp);
AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
AWALYS_USE_ME_WITH_DOUBLE(VTanh);
AWALYS_USE_ME_WITH_DOUBLE(VSquare);
AWALYS_USE_ME_WITH_DOUBLE(VCopy);
AWALYS_USE_ME_WITH_DOUBLE(Softmax);
#undef AWALYS_USE_ME_WITH_DOUBLE
...
...
@@ -244,6 +260,8 @@ REGISTER_MKL_KERNEL(kVAdd, VAdd);
REGISTER_MKL_KERNEL(kVScal, VScal);
REGISTER_MKL_KERNEL(kVExp, VExp);
REGISTER_MKL_KERNEL(kVSquare, VSquare);
REGISTER_MKL_KERNEL(kVCopy, VCopy);
REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast);
REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
REGISTER_MKL_KERNEL(kVTanh, VTanh);
REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
...
...
paddle/fluid/operators/jit/more/mkl/mkl.h
...
...
@@ -50,6 +50,13 @@ void VCopy(const T* x, T* y, int n);
template <typename T>
void VAXPY(T a, const T* x, T* y, int n);

template <typename T>
void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
  for (int64_t h = 0; h < y_h; ++h) {
    VCopy(x, y + h * x_len, x_len);
  }
}

template <typename T>
void VSigmoid(const T* x, T* y, int n) {
  const T min = SIGMOID_THRESHOLD_MIN;
...
...
@@ -192,6 +199,7 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples);
DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
DECLARE_MKL_KERNEL(VTanh, XYNTuples);
DECLARE_MKL_KERNEL(VSquare, XYNTuples);
DECLARE_MKL_KERNEL(VCopy, XYNTuples);

DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
...
...
@@ -201,6 +209,8 @@ DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
DECLARE_MKL_KERNEL(Sgd, SgdTuples);

DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples);

#undef DECLARE_MKL_KERNEL

}  // namespace mkl
...
...
paddle/fluid/operators/jit/refer/CMakeLists.txt
...
...
@@ -13,6 +13,7 @@ USE_JITKERNEL_REFER(kVAddRelu)
USE_JITKERNEL_REFER(kVSub)
USE_JITKERNEL_REFER(kVScal)
USE_JITKERNEL_REFER(kVAddBias)
USE_JITKERNEL_REFER(kVCopy)
USE_JITKERNEL_REFER(kVRelu)
USE_JITKERNEL_REFER(kVIdentity)
USE_JITKERNEL_REFER(kVExp)
...
...
@@ -34,3 +35,4 @@ USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kSoftmax)
USE_JITKERNEL_REFER(kEmbSeqPool)
USE_JITKERNEL_REFER(kSgd)
USE_JITKERNEL_REFER(kVBroadcast)
paddle/fluid/operators/jit/refer/refer.cc
...
...
@@ -30,6 +30,7 @@ REGISTER_REFER_KERNEL(kVScal, VScal);
REGISTER_REFER_KERNEL(kVAddBias, VAddBias);
REGISTER_REFER_KERNEL(kVRelu, VRelu);
REGISTER_REFER_KERNEL(kVCopy, VCopy);
REGISTER_REFER_KERNEL(kVIdentity, VIdentity);
REGISTER_REFER_KERNEL(kVSquare, VSquare);
REGISTER_REFER_KERNEL(kVExp, VExp);
...
...
@@ -61,4 +62,6 @@ REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool);
REGISTER_REFER_KERNEL(kSgd, Sgd);
REGISTER_REFER_KERNEL(kVBroadcast, VBroadcast);
#undef REGISTER_REFER_KERNEL
paddle/fluid/operators/jit/refer/refer.h
...
...
@@ -70,6 +70,20 @@ void VAddBias(const T* a, const T* x, T* y, int n) {
  }
}

template <typename T>
void VCopy(const T* x, T* y, int n) {
  std::memcpy(y, x, n * sizeof(T));
}

// x shape: (x_len)
// y shape: (h, x_len)
template <typename T>
void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
  for (int64_t h = 0; h < y_h; ++h) {
    VCopy(x, y + h * x_len, x_len);
  }
}

template <typename T>
void VRelu(const T* x, T* y, int n) {
  for (int i = 0; i < n; ++i) {
...
@@ -500,6 +514,7 @@ DECLARE_REFER_KERNEL(VExp, XYNTuples);
DECLARE_REFER_KERNEL
(
VSigmoid
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VTanh
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VSquare
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VCopy
,
XYNTuples
);
// lstm_t*, const lstm_attr_t*
DECLARE_REFER_KERNEL
(
LSTMCtHt
,
LSTMTuples
);
...
...
@@ -528,6 +543,8 @@ DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
DECLARE_REFER_KERNEL(Sgd, SgdTuples);

DECLARE_REFER_KERNEL(VBroadcast, VBroadcastTuples);

#undef DECLARE_REFER_KERNEL

}  // namespace refer
...
...
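To make the semantics of the new kVBroadcast kernel concrete, here is a tiny standalone sketch (written for this review, not part of the patch): y, of shape (h, x_len), is simply x repeated h times row by row. The refer and mkl implementations above do exactly this via VCopy, and the JIT code does the same with vmovups over YMM registers.

#include <cstring>
#include <vector>

// Reference behavior of VBroadcast: copy x into each of the y_h rows of y.
template <typename T>
void VBroadcastRef(const T* x, T* y, int64_t y_h, int64_t x_len) {
  for (int64_t h = 0; h < y_h; ++h) {
    std::memcpy(y + h * x_len, x, x_len * sizeof(T));
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f};
  std::vector<float> y(2 * x.size());
  VBroadcastRef(x.data(), y.data(), 2, static_cast<int64_t>(x.size()));
  // y is now {1, 2, 3, 1, 2, 3}.
  return 0;
}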
paddle/fluid/operators/jit/test.cc
...
...
@@ -26,8 +26,8 @@ limitations under the License. */
DEFINE_double(acc, 1e-5, "Test accuracy threshold.");

template <typename T>
void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
               const T upper = static_cast<T>(20.f)) {
void RandomVec(const int n, T* a, const T lower = static_cast<T>(-2.f),
               const T upper = static_cast<T>(2.f)) {
  static unsigned int seed = 100;
  std::mt19937 rng(seed++);
  std::uniform_real_distribution<double> uniform_dist(0, 1);
...
...
@@ -157,6 +157,26 @@ struct TestFuncWithRefer<jit::XRNTuples<T>, std::vector<T>, T> {
  }
};

template <typename T>
struct TestFuncWithRefer<jit::VBroadcastTuples<T>, std::vector<T>,
                         std::vector<T>, int64_t,
                         typename jit::VBroadcastTuples<T>::attr_type> {
  void operator()(const typename jit::VBroadcastTuples<T>::func_type tgt,
                  const std::vector<T>& x, const std::vector<T>& yref,
                  int64_t h,
                  const typename jit::VBroadcastTuples<T>::attr_type& attr) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(x.size(), static_cast<size_t>(attr));
    EXPECT_EQ(yref.size(), x.size() * h);
    std::vector<T> y(yref.size());
    const T* x_data = x.data();
    const T* yref_data = yref.data();
    T* y_data = y.data();
    tgt(x_data, y_data, h, attr);
    ExpectEQ<T>(y_data, yref_data, yref.size());
  }
};

template <typename T>
struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
  void operator()(const typename jit::XYNTuples<T>::func_type tgt,
...
...
@@ -514,7 +534,7 @@ void TestKernelXRNTuples() {
    auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
    EXPECT_TRUE(ref != nullptr);
    std::vector<T> x(d);
    RandomVec<T>(d, x.data(), -2.f, 2.f);
    RandomVec<T>(d, x.data());
    T ref_res;
    ref(x.data(), &ref_res, d);
    TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
...
@@ -532,7 +552,7 @@ void TestKernelXYNTuples() {
std
::
vector
<
T
>
x
(
d
),
yref
(
d
);
std
::
vector
<
T
>
xinp
(
d
);
// inplace test
RandomVec
<
T
>
(
d
,
x
.
data
()
,
-
2.
f
,
2.
f
);
RandomVec
<
T
>
(
d
,
x
.
data
());
std
::
copy
(
x
.
begin
(),
x
.
end
(),
xinp
.
begin
());
const
T
*
x_data
=
x
.
data
();
...
...
@@ -566,7 +586,7 @@ void TestKernelLSTMTuples() {
EXPECT_TRUE
(
ref
!=
nullptr
);
std
::
vector
<
T
>
xsrc
(
4
*
d
),
wp
(
3
*
d
),
ct_1
(
d
);
std
::
vector
<
T
>
ct_ref
(
d
),
ht_ref
(
d
),
checked
(
2
*
d
);
RandomVec
<
T
>
(
4
*
d
,
xsrc
.
data
()
,
-
2.
f
,
2.
f
);
RandomVec
<
T
>
(
4
*
d
,
xsrc
.
data
());
RandomVec
<
T
>
(
3
*
d
,
wp
.
data
(),
-
1.
f
,
1.
f
);
RandomVec
<
T
>
(
d
,
ct_1
.
data
(),
-
1.
f
,
1.
f
);
// x could be changed after compute, so copy to save src
...
...
@@ -614,8 +634,8 @@ void TestKernelGRUTuples() {
    auto ref = jit::GetRefer<KT, jit::GRUTuples<T>>();
    EXPECT_TRUE(ref != nullptr);
    std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
    RandomVec<T>(3 * d, xsrc.data(), -2.f, 2.f);
    RandomVec<T>(d, ht_1.data(), -2.f, 2.f);
    RandomVec<T>(3 * d, xsrc.data());
    RandomVec<T>(d, ht_1.data());
    // x could be changed after compute, so copy to save src
    std::vector<T> x(xsrc.size());
    std::copy(xsrc.begin(), xsrc.end(), x.begin());
...
...
@@ -651,7 +671,7 @@ void TestKernelSeqPoolTuples() {
      auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> x(h * w), yref(w);
      RandomVec<T>(h * w, x.data(), -2.f, 2.f);
      RandomVec<T>(h * w, x.data());
      const T* x_data = x.data();
      T* yref_data = yref.data();
      ref(x_data, yref_data, &attr);
...
...
@@ -676,8 +696,8 @@ void TestKernelMatMulTuples() {
        auto ref = jit::GetRefer<KT, jit::MatMulTuples<T>>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> a(m * k), b(k * n), c(m * n);
        RandomVec<T>(m * k, a.data(), -2.f, 2.f);
        RandomVec<T>(k * n, b.data(), -2.f, 2.f);
        RandomVec<T>(m * k, a.data());
        RandomVec<T>(k * n, b.data());
        const T* a_data = a.data();
        const T* b_data = b.data();
        T* c_data = c.data();
...
...
@@ -699,7 +719,7 @@ void TestKernelSoftmaxTuples() {
      auto ref = jit::GetRefer<KT, jit::SoftmaxTuples<T>>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> x(bs * n), y(bs * n);
      RandomVec<T>(bs * n, x.data(), -2.f, 2.f);
      RandomVec<T>(bs * n, x.data());
      const T* x_data = x.data();
      T* y_data = y.data();
...
@@ -726,7 +746,7 @@ void TestKernelEmbSeqPoolTuples() {
test_sizes
.
erase
(
std
::
remove
(
test_sizes
.
begin
(),
test_sizes
.
end
(),
1000
));
for
(
int
tbl_w
:
test_sizes
)
{
std
::
vector
<
T
>
table
(
tbl_h
*
tbl_w
);
RandomVec
<
T
>
(
tbl_h
*
tbl_w
,
table
.
data
()
,
-
2.
f
,
2.
f
);
RandomVec
<
T
>
(
tbl_h
*
tbl_w
,
table
.
data
());
const
T
*
table_data
=
table
.
data
();
for
(
auto
type
:
pool_types
)
{
for
(
int
idx_w
:
{
1
,
2
,
10
,
16
})
{
...
...
@@ -772,14 +792,14 @@ void TestKernelSgdTuples() {
    for (int grad_w : TestSizes()) {
      std::vector<T> param(param_h * grad_w);
      std::vector<T> param_out(param_h * grad_w);
      RandomVec<T>(param_h * grad_w, param.data(), -2.f, 2.f);
      RandomVec<T>(param_h * grad_w, param.data());
      const T* param_data = param.data();
      T* out_data = param_out.data();
      for (int rows_size = 1; rows_size <= param_h; ++rows_size) {
        std::vector<T> grad(rows_size * grad_w);
        std::vector<int64_t> rows =
            UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
        RandomVec<T>(rows_size * grad_w, grad.data(), -2.f, 2.f);
        RandomVec<T>(rows_size * grad_w, grad.data());
        const int64_t* rows_data = rows.data();
        const T* grad_data = grad.data();
        auto ref = jit::GetRefer<KT, jit::SgdTuples<T>>();
...
...
@@ -815,8 +835,8 @@ void TestKernelNCHW16CMulNCTuples() {
      int sz = n * c * h * w;
      std::vector<T> x(sz), y(n * c), zref(sz);
      std::vector<T> ztgt(sz), zjit(sz);
      RandomVec<T>(sz, x.data(), -2.f, 2.f);
      RandomVec<T>(n * c, y.data(), -2.f, 2.f);
      RandomVec<T>(sz, x.data());
      RandomVec<T>(n * c, y.data());

      const T* x_data = x.data();
      const T* y_data = y.data();
...
...
@@ -873,11 +893,11 @@ void TestKernelLayerNormTuples() {
        int sz = left * right;
        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
            outref(sz);
        RandomVec<T>(sz, x.data(), -2.f, 2.f);
        RandomVec<T>(left, mean.data(), -2.f, 2.f);
        RandomVec<T>(left, var.data(), -2.f, 2.f);
        RandomVec<T>(right, scale.data(), -2.f, 2.f);
        RandomVec<T>(right, bias.data(), -2.f, 2.f);
        RandomVec<T>(sz, x.data());
        RandomVec<T>(left, mean.data());
        RandomVec<T>(left, var.data());
        RandomVec<T>(right, scale.data());
        RandomVec<T>(right, bias.data());

        const T* scale_data = scale.data();
        const T* bias_data = bias.data();
...
@@ -903,7 +923,7 @@ void TestKernelCRFDecodingTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  constexpr int state_trans_base_idx = 2;
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
  for (int seq_len : {1, 11, 17, 50}) {
    for (int tag_num : test_sizes) {
      auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
...
...
@@ -912,8 +932,8 @@ void TestKernelCRFDecodingTuples() {
      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
      std::vector<int> trackref(x_sz);
      RandomVec<T>(x_sz, x.data(), -2.f, 2.f);
      RandomVec<T>(w_sz, w.data(), -2.f, 2.f);
      RandomVec<T>(x_sz, x.data());
      RandomVec<T>(w_sz, w.data());
      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
          trackref.data(), tag_num);
...
...
@@ -926,6 +946,27 @@ void TestKernelCRFDecodingTuples() {
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelVBroadcastTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  for (int w : TestSizes()) {
    std::vector<T> x(w);
    RandomVec<T>(w, x.data());
    const T* x_data = x.data();
    for (int64_t h : {1, 2, 6}) {
      auto ref = jit::GetRefer<KT, jit::VBroadcastTuples<T>>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> y(w * h);
      T* y_data = y.data();
      ref(x_data, y_data, h, w);

      TestAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType, std::vector<T>,
                   std::vector<T>, int64_t>(static_cast<int64_t>(w), x, y, h,
                                            static_cast<int64_t>(w));
    }
  }
}
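The new VBroadcast test above compares each implementation against a reference obtained through jit::GetRefer and called as ref(x_data, y_data, h, w). As a rough mental model (an assumption drawn only from that call signature, not from the Paddle kernel source), the reference tiles a length-w vector h times. A minimal standalone sketch in C++:

#include <cstdint>
#include <vector>

// Hypothetical illustration; not the Paddle jit kernel itself.
template <typename T>
void VBroadcastRef(const T* x, T* y, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i) {    // repeat the length-w row h times
    for (int64_t j = 0; j < w; ++j) {  // copy the w input values
      y[i * w + j] = x[j];
    }
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f};
  std::vector<float> y(2 * x.size());
  VBroadcastRef(x.data(), y.data(), 2, static_cast<int64_t>(x.size()));
  // y is now {1, 2, 3, 1, 2, 3}
}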
#define TEST_CPU_KERNEL(test_tuple, kernel_type) \
TEST(JITKernel, kernel_type) { \
TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
...
...
@@ -949,6 +990,7 @@ TEST_CPU_KERNEL(XYNTuples, kVSquare);
TEST_CPU_KERNEL(XYNTuples, kVExp);
TEST_CPU_KERNEL(XYNTuples, kVSigmoid);
TEST_CPU_KERNEL(XYNTuples, kVTanh);
TEST_CPU_KERNEL(XYNTuples, kVCopy);

TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt);
TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1);
...
...
@@ -966,6 +1008,7 @@ TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool);
TEST_CPU_KERNEL(SgdTuples, kSgd);
TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm);
TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding);
TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast);

TEST(JITKernel_key, lstm) {
  jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
...
...
paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/requantize_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {

using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
using platform::to_void_cast;
using Tensor = framework::Tensor;
using framework::DataLayout;
using mkldnn::stream;
using platform::GetMKLDNNFormat;

template <typename T>
class ReQuantOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<Tensor>("Input");
    auto scale_in = ctx.Attr<float>("Scale_in");
    auto scale_out = ctx.Attr<float>("Scale_out");
    auto* output = ctx.Output<Tensor>("Output");
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& engine = dev_ctx.GetEngine();

    std::vector<primitive> pipeline;
    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
    mkldnn::memory::data_type src_dt =
        paddle::framework::ToMKLDNNDataType(input->type());
    mkldnn::memory::data_type dst_dt = src_dt;  // TODO(Xiaoli) support
                                                // requantize from different
                                                // data type (e.g., s8 to u8)
    mkldnn::memory::format src_fmt = memory::format::nhwc;
    mkldnn::memory::format dst_fmt = memory::format::nhwc;

    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    float scale_shift = scale_out / scale_in;

    mkldnn::primitive_attr attri;
    int mask = 0;
    attri.set_output_scales(mask, {scale_shift});

    auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt);
    auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
    auto src_memory =
        std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
    std::shared_ptr<primitive::at> src_memory_p =
        std::shared_ptr<primitive::at>(new primitive::at(*src_memory));

    auto dst_md = platform::MKLDNNMemDesc({dst_tz}, dst_dt, dst_fmt);
    auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
    auto dst_memory = mkldnn::memory(dst_pd, to_void_cast<T>(output_data));

    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
        new reorder::primitive_desc(src_pd, dst_pd, attri));
    auto reorder_p = std::shared_ptr<reorder>(
        new reorder(*reorder_pd, *src_memory_p, dst_memory));
    pipeline.push_back(*reorder_p);
    stream(stream::kind::eager).submit(pipeline).wait();

    output->set_layout(DataLayout::kMKLDNN);
    output->set_format(GetMKLDNNFormat(dst_memory));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_KERNEL(requantize, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ReQuantOpKernel<int8_t>,
                   ops::ReQuantOpKernel<uint8_t>);
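For intuition about what the reorder with set_output_scales computes: each INT8 value quantized under Scale_in is multiplied by scale_shift = scale_out / scale_in and stored back as INT8. A minimal standalone sketch of that arithmetic, assuming simple round-and-saturate behaviour (illustration only, not the MKL-DNN execution path):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustration only: rescale INT8 data quantized with scale_in so that it is
// expressed under scale_out, which is what the reorder's output scale does.
std::vector<int8_t> RequantizeRef(const std::vector<int8_t>& in, float scale_in,
                                  float scale_out) {
  const float scale_shift = scale_out / scale_in;
  std::vector<int8_t> out(in.size());
  for (std::size_t i = 0; i < in.size(); ++i) {
    float v = std::round(in[i] * scale_shift);  // rescale
    v = std::max(-128.f, std::min(127.f, v));   // saturate to the int8 range
    out[i] = static_cast<int8_t>(v);
  }
  return out;
}

int main() {
  // With Scale_in = 2 and Scale_out = 1 every stored value is halved.
  auto out = RequantizeRef({100, -50, 7}, 2.f, 1.f);  // -> {50, -25, 4}
}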
paddle/fluid/operators/recurrent_op.cc
...
...
@@ -157,11 +157,13 @@ class RecurrentBase : public framework::OperatorBase {
                          const std::vector<std::string> &src_vars,
                          framework::Scope *dst_scope,
                          const std::vector<std::string> &dst_vars,
                          Callback callback) {
                          Callback callback, bool is_backward = false) {
    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
    for (size_t i = 0; i < dst_vars.size(); ++i) {
      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
                   is_backward);
    }
  }
...
...
@@ -173,11 +175,13 @@ class RecurrentBase : public framework::OperatorBase {
                          const std::vector<std::string> &src_vars,
                          const framework::Scope &dst_scope,
                          const std::vector<std::string> &dst_vars,
                          Callback callback) {
                          Callback callback, bool is_backward = false) {
    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
    for (size_t i = 0; i < dst_vars.size(); ++i) {
      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
                   is_backward);
    }
  }
...
...
@@ -194,9 +198,13 @@ class RecurrentBase : public framework::OperatorBase {
  static void AccessTensor(const framework::Scope &src_scope,
                           const std::string &src_var_name,
                           framework::Scope *dst_scope,
                           const std::string &dst_var_name, Callback callback) {
                           const std::string &dst_var_name, Callback callback,
                           bool is_backward = false) {
    auto *src_var = src_scope.FindVar(src_var_name);
    PADDLE_ENFORCE(src_var != nullptr);
    if (is_backward && src_var == nullptr) {
      return;
    }
    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
    auto &src_tensor = src_var->Get<framework::LoDTensor>();
    auto *dst_var = dst_scope->Var(dst_var_name);
...
...
@@ -208,12 +216,16 @@ class RecurrentBase : public framework::OperatorBase {
  static void AccessTensor(const framework::Scope &src_scope,
                           const std::string &src_var_name,
                           const framework::Scope &dst_scope,
                           const std::string &dst_var_name, Callback callback) {
                           const std::string &dst_var_name, Callback callback,
                           bool is_backward = false) {
    auto *dst_var = dst_scope.FindVar(dst_var_name);
    if (is_backward && dst_var == nullptr) {
      return;
    }
    auto *src_var = src_scope.FindVar(src_var_name);
    PADDLE_ENFORCE(src_var != nullptr);
    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
    auto &src_tensor = src_var->Get<framework::LoDTensor>();
    auto *dst_var = dst_scope.FindVar(dst_var_name);
    PADDLE_ENFORCE(dst_var != nullptr);
    PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name);
    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
    callback(src_tensor, dst_tensor);
  }
...
...
@@ -345,7 +357,8 @@ class RecurrentGradOp : public RecurrentBase {
          auto dims = framework::vectorize(inside->dims());
          dims.erase(dims.begin());
          inside->Resize(framework::make_ddim(dims));
        });
        },
        true /*is_backward*/);
    auto og_set = List2Set(Inputs(kOutputGrads));

    if (VLOG_IS_ON(10)) {
...
...
@@ -454,7 +467,8 @@ class RecurrentGradOp : public RecurrentBase {
            auto dst = outside->Slice(seq_offset, seq_offset + 1);
            framework::TensorCopy(inside, place, dev_ctx, &dst);
          });
          },
          true /*is_backward*/);
      VLOG(5) << "Link outside gradient finished ";

      if (step_id + 1 == seq_len) {  // at_end
...
...
@@ -467,7 +481,8 @@ class RecurrentGradOp : public RecurrentBase {
              outside->Resize(inside.dims());
              outside->mutable_data(place, inside.type());
              framework::TensorCopy(inside, place, dev_ctx, outside);
            });
            },
            true /*is_backward*/);
        VLOG(5) << "Link initialize state gradient finished ";
      }
      scopes.Next();
...
...
@@ -608,10 +623,8 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
    std::vector<std::string> input{kInputs, kInitialStates};
    std::vector<std::string> output{kOutputs};
    for (auto &s : input) {
      // NOTE(zcd): In some case, some of kInputs doesn't have gradient.
      PADDLE_ENFORCE(ctx->HasInputs(s));
      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
                     "Cannot find the gradient variable %s",
                     framework::GradVarName(s));
    }
    for (auto &s : output) {
      PADDLE_ENFORCE(ctx->HasInputs(s));
...
...
paddle/fluid/operators/requantize_op.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/requantize_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {

framework::OpKernelType ReQuantOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
  framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;

  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
                                 ctx.GetPlace(), layout_, library_);
}

void ReQuantOpMaker::Make() {
  AddInput("Input", "input data");
  AddOutput("Output", "output data");
  AddAttr<float>("Scale_in", "scale in data").SetDefault({1.0f});
  AddAttr<float>("Scale_out", "scale out data").SetDefault({1.0f});
  AddComment(
      R"DOC(This op will re-quantize data from INT8 with scale_in to INT8 with scale_out)DOC");
}

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(requantize, ops::ReQuantOp, ops::ReQuantOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
paddle/fluid/operators/requantize_op.h
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {

using framework::OpKernelType;
using framework::Tensor;

class ReQuantOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
    ctx->ShareLoD("Input", /*->*/ "Output");
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class ReQuantOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/reshape_op.cc
...
...
@@ -56,6 +56,9 @@ class ReshapeOp : public framework::OperatorWithKernel {
  static framework::DDim ValidateShape(const std::vector<int> shape,
                                       const framework::DDim& in_dims) {
    const int64_t in_size = framework::product(in_dims);
    auto in_dims_vec = framework::vectorize(in_dims);
    bool all_positive = std::all_of(in_dims_vec.cbegin(), in_dims_vec.cend(),
                                    [](int64_t i) { return i > 0; });
    // only one dimension can be set to -1, whose size will be automatically
    // infered.
    const int64_t unk_dim_val = -1;
...
...
@@ -88,7 +91,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
    }

    if (unk_dim_idx != -1) {
      if (in_size > 0) {
      if (all_positive) {
        // in_size < 0 and is un-determinate in compile time, skip the check,
        // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
        // capacity = -24, in_size = -8, output_shape[0] = 0
...
...
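The all_positive check above matters because the single -1 entry of the target shape can only be inferred when the input capacity is fully known at compile time. A small standalone sketch of that inference (hypothetical helper for illustration, not the operator code):

#include <cassert>
#include <cstdint>
#include <vector>

// Infer the one -1 entry of a target shape when all input dims are positive.
std::vector<int64_t> InferReshape(const std::vector<int64_t>& in_dims,
                                  std::vector<int64_t> shape) {
  int64_t in_size = 1, known = 1;
  int unk_idx = -1;
  for (auto d : in_dims) in_size *= d;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      unk_idx = static_cast<int>(i);
    } else {
      known *= shape[i];
    }
  }
  if (unk_idx >= 0) {
    assert(in_size % known == 0);  // capacity must divide evenly
    shape[unk_idx] = in_size / known;
  }
  return shape;
}

int main() {
  // A [2, 8] tensor reshaped with {-1, 4} becomes {4, 4}.
  auto out = InferReshape({2, 8}, {-1, 4});
  assert(out[0] == 4 && out[1] == 4);
}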
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
...
...
@@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Ys", "A list of outputs").AsDuplicable();
    AddAttr<std::string>("subgraph", "the subgraph.");
    AddAttr<std::string>("calibration_data", "the calibration data for int8");
    AddAttr<std::string>(
        "engine_serialized_data",
        "the serialized data contains the all info of the ICUDAEngine");
    AddAttr<std::string>(
        "engine_key",
        "The engine_key here is used to distinguish different TRT Engines");
...
...
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
...
...
@@ -16,8 +16,10 @@
#ifdef PADDLE_WITH_CUDA
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/executor.h"
...
...
@@ -31,37 +33,6 @@ namespace paddle {
namespace operators {

using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;

namespace {  // NOLINT

TRT_DT FluidDataType2TRT(FluidDT type) {
  switch (type) {
    case FluidDT::VarType_Type_FP32:
      return TRT_DT::kFLOAT;
    case FluidDT::VarType_Type_INT32:
      return TRT_DT::kINT32;
    default:
      return TRT_DT::kINT32;
  }
  PADDLE_THROW("unkown type");
  return TRT_DT::kINT32;
}

nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
  PADDLE_ENFORCE_GT(shape.size(), 1UL,
                    "TensorRT' tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT' tensor input requires at most 4 dimensions");
  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
  if (shape.size() == 4UL)
    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
  return nvinfer1::DimsCHW(shape[1], 1, 1);
}

}  // namespace // NOLINT

using inference::Singleton;
using inference::tensorrt::TensorRTEngine;
using inference::tensorrt::TRTInt8Calibrator;
...
...
@@ -79,6 +50,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
  bool enable_int8_;
  std::string calibration_data_;
  std::string engine_key_;
  std::string engine_serialized_data_;
  bool calibration_mode_;

 public:
...
...
@@ -93,6 +65,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
    enable_int8_ = Attr<bool>("enable_int8");
    calibration_data_ = Attr<std::string>("calibration_data");
    engine_key_ = Attr<std::string>("engine_key");
    engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
    auto params = Attr<std::vector<std::string>>("parameters");
    for (const auto& param : params) {
...
...
@@ -125,7 +98,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
      RunCalibration(scope, dev_place);
      return;
    }
    RunTrt(scope, dev_place);
    auto* trt_engine = GetEngine(scope, dev_place);
    RunTrt(scope, dev_place, trt_engine);
  }

  void RunCalibration(const framework::Scope& scope,
...
...
@@ -136,10 +110,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
    LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_
                         << " is running calibration trt int8... ";
    int runtime_batch = 1;
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& dev_ctx = *pool.Get(dev_place);
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx).stream();
    if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
      TRTCalibratorEngine* calib_res =
          Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
...
...
@@ -156,11 +126,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
          calib_buffers, runtime_batch, engine_key_, dev_place));
      calib_res->thr_.reset(new std::thread([&]() {
        calib_res->engine_.reset(new TensorRTEngine(
            max_batch_size_, workspace_size_, stream,
            boost::get<platform::CUDAPlace>(dev_place).device, enable_int8_,
            calib_res->calib_.get()));
            max_batch_size_, workspace_size_, enable_int8_,
            calib_res->calib_.get(),
            boost::get<platform::CUDAPlace>(dev_place).device));
        VLOG(3) << "start the calib trt engine thread";
        Prepare(scope, dev_place, calib_res->engine_.get());
        PrepareTRTEngine(scope, calib_res->engine_.get());
      }));
    }
...
...
@@ -180,28 +150,29 @@ class TensorRTEngineOp : public framework::OperatorBase {
    RunNativeImpl(scope, dev_place);
  }

  void RunTrt(const framework::Scope& scope,
              const platform::Place& dev_place) const {
  void RunTrt(const framework::Scope& scope, const platform::Place& dev_place,
              TensorRTEngine* engine) const {
    int runtime_batch = 1;
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& dev_ctx = *pool.Get(dev_place);
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx).stream();
    if (trt_engine_.get() == nullptr) {
      trt_engine_.reset(
          new TensorRTEngine(max_batch_size_, workspace_size_, stream,
                             boost::get<platform::CUDAPlace>(dev_place).device,
                             enable_int8_, calibrator_.get()));
      Prepare(scope, dev_place, trt_engine_.get());
    }
    auto* engine = trt_engine_.get();
    PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");

    std::vector<std::string> output_maps =
        Attr<std::vector<std::string>>("output_name_mapping");

    // Convert input tensor from fluid to engine.
    int num_inputs = 0;
    for (const auto& x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      num_inputs += 1;
    }
    const int num_bindings = num_inputs + Outputs("Ys").size();
    std::vector<void*> buffers(num_bindings);

    // Bind input tensor to TRT.
    for (const auto& x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      // convert input and copy to TRT engine's buffer
...
...
@@ -209,28 +180,20 @@ class TensorRTEngineOp : public framework::OperatorBase {
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
      auto t_shape = framework::vectorize(t.dims());
      runtime_batch = t_shape[0];
      if (platform::is_cpu_place(t.place())) {
        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
                                t.memory_size());
      } else {
        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
                                t.memory_size());
      }
    }
    cudaStreamSynchronize(stream);
    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
    // Execute the engine.
    engine->Execute(runtime_batch);
      const int bind_index = engine->engine()->getBindingIndex(x.c_str());
      PADDLE_ENFORCE(bind_index < num_bindings,
                     "The bind index should be less than num_bindings");
      buffers[bind_index] = static_cast<void*>(t.data<float>());
    }

    // Convert output tensor from engine to fluid
    // Bind output tensor to TRT.
    int output_index = 0;
    VLOG(4) << "TensorRT Engine Op Outputs:";
    for (const auto& y : Outputs("Ys")) {
      VLOG(4) << y;
      // convert output and copy to fluid.
      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
      auto dims = trt_t->getDimensions();
      const int bind_index =
          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
      auto dims = engine->engine()->getBindingDimensions(bind_index);
      // Use the output ITensor's dims to reshape the Fluid Tensor.
      // The ITensor doesn't contain the batch size dim.
      std::vector<int> ddim;
...
...
@@ -238,71 +201,55 @@ class TensorRTEngineOp : public framework::OperatorBase {
      for (int i = 0; i < dims.nbDims; i++) {
        ddim.push_back(dims.d[i]);
      }
      auto* fluid_v = scope.FindVar(y);
      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
      fluid_t->Resize(framework::make_ddim(ddim));

      // TODO(Superjomn) change this float to dtype size.
      auto size =
          inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
      engine->GetOutputInGPU(
          output_maps[output_index],
          fluid_t->mutable_data<float>(platform::CUDAPlace(
              boost::get<platform::CUDAPlace>(dev_place).device)),
          size * sizeof(float));
      PADDLE_ENFORCE(bind_index < num_bindings,
                     "The bind index should be less than num_bindings");
      buffers[bind_index] = static_cast<void*>(fluid_t->mutable_data<float>(
          boost::get<platform::CUDAPlace>(dev_place)));

      output_index += 1;
    }

    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
    // Execute the engine.
    engine->Execute(runtime_batch, &buffers, stream);
    cudaStreamSynchronize(stream);
  }

  void Prepare(const framework::Scope& scope, const platform::Place& dev_place,
  TensorRTEngine* GetEngine(const framework::Scope& scope,
                            const platform::Place& dev_place) const {
    if (!trt_engine_) {
      trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
          max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
          boost::get<platform::CUDAPlace>(dev_place).device));
      if (!engine_serialized_data_.empty()) {
        trt_engine_->Deserialize(engine_serialized_data_);
      } else {
        PrepareTRTEngine(scope, trt_engine_.get());
      }
    }
    return trt_engine_.get();
  }

  void PrepareTRTEngine(const framework::Scope& scope,
                        TensorRTEngine* engine) const {
    LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
                 "kernel etc). This process may cost a lot of time.";
    framework::proto::BlockDesc block_desc;
    block_desc.ParseFromString(Attr<std::string>("subgraph"));
    framework::proto::BlockDesc block_proto;
    block_proto.ParseFromString(Attr<std::string>("subgraph"));
    framework::BlockDesc block_desc(nullptr, &block_proto);

    std::vector<std::string> output_maps =
    std::vector<std::string> inputs = Inputs("Xs");
    std::vector<std::string> outputs =
        Attr<std::vector<std::string>>("output_name_mapping");

    engine->InitNetwork();
    framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
    VLOG(4) << "parsed var size " << block.AllVars().size();
    // Add inputs
    VLOG(4) << "declare inputs";
    for (auto& input : Inputs("Xs")) {
      if (param_names_.count(input)) continue;
      VLOG(4) << "declare input " << input;
      auto& t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
      auto t_shape = framework::vectorize(t.dims());
      auto* var = block.FindVar(input);
      // TensorRT engine need to create parameters. The parameter's description
      // should be set in
      PADDLE_ENFORCE(var, "no variable called %s", input);
      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                        "TensorRT engine only takes LoDTensor as input");
      engine->DeclareInput(
          input, FluidDataType2TRT(
                     var->Proto()->type().lod_tensor().tensor().data_type()),
          Vec2TRT_Dims(t_shape));
    }
    inference::Singleton<inference::tensorrt::OpConverter>::Global()
        .ConvertBlock(block_desc, param_names_, scope, engine);
    // Add outputs
    for (auto& output : output_maps) {
      engine->DeclareOutput(output);
    }
    engine->FreezeNetwork();
        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
                                 outputs, engine);
  }
};
...
...
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
...
...
@@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) {
  engine_op_desc.SetAttr("output_name_mapping",
                         std::vector<std::string>({"z0"}));
  engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));

  LOG(INFO) << "create engine op";
  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
...
...
@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
  engine_op_desc.SetAttr("output_name_mapping",
                         std::vector<std::string>({"z3"}));
  engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));

  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
...
...
paddle/fluid/platform/gpu_info.cc
...
...
@@ -38,6 +38,22 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk."
);
DEFINE_double
(
initial_gpu_memory_in_mb
,
-
1.0
,
"GPU memory chunk size in MB."
"Allocator would allocate FLAGS_initial_gpu_memory_in_mb size "
"chunk first and reallocate FLAGS_reallocate_gpu_memory_in_mb size "
"chunk when the first chunk is not enough. This flag has higher priority "
"than FLAGS_fraction_of_gpu_memory_to_use. Disable when less than 0."
);
DEFINE_double
(
reallocate_gpu_memory_in_mb
,
-
1.0
,
"GPU memory chunk size in MB."
"If FLAGS_initial_gpu_memory_in_mb is set and "
"FLAGS_reallocate_gpu_memory_in_mb "
"is less than 0, it would be replaced by "
"FLAGS_initial_gpu_memory_in_mb. Disable "
"when FLAGS_initial_gpu_memory_in_mb is less than 0."
);
DEFINE_bool
(
enable_cublas_tensor_op_math
,
false
,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
...
...
@@ -211,13 +227,54 @@ size_t GpuMaxChunkSize() {
  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
                                          (total - reserving));

  PADDLE_ENFORCE_LE(allocating, available,
                    "Insufficient GPU memory to allocation.");

  return allocating;
}

size_t GpuFirstAllocateChunkSize() {
  if (FLAGS_initial_gpu_memory_in_mb <= 0) {
    return GpuMaxChunkSize();
  }

  size_t total = 0;
  size_t available = 0;

  GpuMemoryUsage(&available, &total);
  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
           << total / 1024 / 1024 << "M";

  size_t initial_mem =
      static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb * (1 << 20));
  PADDLE_ENFORCE_LE(initial_mem, available,
                    "Insufficient GPU memory to allocation.");
  return initial_mem;
}

size_t GpuReAllocateChunkSize() {
  if (FLAGS_initial_gpu_memory_in_mb <= 0) {
    return GpuMaxChunkSize();
  }

  double reallocate_mem = FLAGS_reallocate_gpu_memory_in_mb;
  if (reallocate_mem < 0) {
    PADDLE_ENFORCE(FLAGS_initial_gpu_memory_in_mb > 0,
                   "FLAGS_init_gpu_memory_to_use_mb must be larger than 0");
    reallocate_mem = FLAGS_initial_gpu_memory_in_mb;
  }

  size_t total = 0;
  size_t available = 0;
  GpuMemoryUsage(&available, &total);
  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
           << total / 1024 / 1024 << "M";

  size_t realloc_mem = static_cast<size_t>(reallocate_mem * (1 << 20));
  PADDLE_ENFORCE_LE(realloc_mem, available,
                    "Insufficient GPU memory to allocation.");
  return realloc_mem;
}

void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind, cudaStream_t stream) {
  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
...
...
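Taken together, the two new flags appear to take precedence over FLAGS_fraction_of_gpu_memory_to_use when choosing the buddy allocator's chunk sizes, with the reallocation size falling back to the initial size when unset. A rough standalone sketch of that precedence (an assumed simplification of GpuFirstAllocateChunkSize/GpuReAllocateChunkSize; the real functions also verify available memory):

#include <cstddef>
#include <cstdio>

// Hypothetical stand-ins for the gflags values, for illustration only.
double fraction_of_gpu_memory_to_use = 0.92;
double initial_gpu_memory_in_mb = 500.0;    // first chunk: 500 MB
double reallocate_gpu_memory_in_mb = -1.0;  // unset -> falls back to initial

std::size_t FirstChunkBytes(std::size_t free_bytes) {
  if (initial_gpu_memory_in_mb <= 0) {  // flag disabled: use the fraction
    return static_cast<std::size_t>(fraction_of_gpu_memory_to_use * free_bytes);
  }
  return static_cast<std::size_t>(initial_gpu_memory_in_mb) << 20;
}

std::size_t ReallocChunkBytes(std::size_t free_bytes) {
  if (initial_gpu_memory_in_mb <= 0) {
    return static_cast<std::size_t>(fraction_of_gpu_memory_to_use * free_bytes);
  }
  double mb = reallocate_gpu_memory_in_mb > 0 ? reallocate_gpu_memory_in_mb
                                              : initial_gpu_memory_in_mb;
  return static_cast<std::size_t>(mb) << 20;
}

int main() {
  std::size_t free_bytes = std::size_t(8) << 30;  // assume 8 GB free
  std::printf("first chunk: %zu MB\n", FirstChunkBytes(free_bytes) >> 20);
  std::printf("realloc chunk: %zu MB\n", ReallocChunkBytes(free_bytes) >> 20);
}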
paddle/fluid/platform/gpu_info.h
...
...
@@ -66,6 +66,12 @@ size_t GpuMinChunkSize();
//! Get the maximum chunk size for GPU buddy allocator.
size_t GpuMaxChunkSize();

//! Get init chunk size for GPU buddy allocator.
size_t GpuFirstAllocateChunkSize();

//! Get reallocate chunk size for GPU buddy allocator.
size_t GpuReAllocateChunkSize();

//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind, cudaStream_t stream);
...
...
paddle/fluid/platform/temporary_allocator.cc
...
...
@@ -77,6 +77,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
  }
  VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
           << " size: " << temp_allocation->size();
  alloc::AllocationDeleter()(temp_allocation);
}

size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
...
...
paddle/fluid/pybind/inference_api.cc
...
...
@@ -221,7 +221,8 @@ void BindAnalysisConfig(py::module *m) {
      .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
           py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
           py::arg("min_subgraph_size") = 3,
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
           py::arg("use_static") = true)
      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
      .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
           py::arg("x") = true)
...
...
python/paddle/fluid/__init__.py
...
...
@@ -159,6 +159,7 @@ def __bootstrap__():
    if core.is_compiled_with_cuda():
        read_env_flags += [
            'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb',
            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
            'cudnn_exhaustive_search', 'memory_optimize_debug',
            'selected_gpus',
...
...
python/paddle/fluid/imperative/layer_object_helper.py
0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import copy
import six
from ..framework import Parameter, _in_imperative_mode
from ..param_attr import ParamAttr
from .. import core
from six.moves import zip
from ..layer_helper_base import LayerHelperBase


class LayerObjectHelper(LayerHelperBase):
    def __init__(self, name):
        super(LayerObjectHelper, self).__init__(name, layer_type=name)

    def append_op(self,
                  type=None,
                  inputs=None,
                  outputs=None,
                  attrs=None,
                  stop_gradient=None):
        """append an operator for this layer object.

           Args:
               type: operator type
               inputs: input variable of the operator
               dtype: data type of this parameter
               is_bias: if this is a bias parameter
               default_initializer: set the default initializer for this parameter

        Returns created parameter Variable.
        """
        return self.main_program.current_block().append_op(
            type=type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=stop_gradient)

    def _multiple_input(self, inputs_in):
        inputs = inputs_in
        ret = []
        if isinstance(inputs, (list, tuple)):
            for inp in inputs:
                ret.append(self.to_variable(inp))
        else:
            ret.append(self.to_variable(inputs))
        return ret

    # TODO: make it public when we need it
    def _input(self, inputs_in):
        inputs = self._multiple_input(inputs_in)
        if len(inputs) != 1:
            raise "{0} layer only takes one input".format(self.layer_type)
        return inputs[0]

    def _multiple_param_attr(self, length, param_attr_in=None):
        param_attr = param_attr_in
        if isinstance(param_attr, ParamAttr):
            param_attr = [param_attr]

        if len(param_attr) != 1 and len(param_attr) != length:
            raise ValueError("parameter number mismatch")
        elif len(param_attr) == 1 and length != 1:
            tmp = [None] * length
            for i in six.moves.range(length):
                tmp[i] = copy.deepcopy(param_attr[0])
            param_attr = tmp
        return param_attr

    def iter_inputs_and_params(self, inputs_in, param_attr_in=None):
        """Access all inputs and params one by one

           Args:
               inputs_in: inputs to be iter
               param_attr_in: param_attr to be iter

        Returns input, param_attr
        """
        inputs = inputs_in if (inputs_in is not None) else []
        inputs = self._multiple_input(inputs)
        param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
        for ipt, param_attr in zip(inputs, param_attrs):
            yield ipt, param_attr

    def input_dtype(self, inputs_in):
        """Get input data type

           Args:
               inputs_in: inputs wanted know the data type

        Returns dtype of the input
        """
        inputs = self._multiple_input(inputs_in)
        dtype = None
        for each in inputs:
            if dtype is None:
                dtype = each.dtype
            elif dtype != each.dtype:
                raise ValueError("Data Type mismatch: %d to %d" %
                                 (dtype, each.dtype))
        return dtype

    def get_parameter(self, name):
        """Get parameter specifically

           Args:
               name: parameter's name

        Returns target parameter
        """
        param = self.main_program.global_block().var(name)
        if not isinstance(param, Parameter):
            raise ValueError("no Parameter name %s found" % name)
        return param

    def append_bias_op(self, input_var, dim_start=1, dim_end=None,
                       bias_attr=None):
        """Append bias operator and return its output. If the user does not set bias_attr, append_bias_op will return input_var

           Args:
               input_var: the input variable. The len(input_var.shape) is
               larger or equal than 2.
               dim_start:
               dim_end: the shape of the bias will be
               bias_attr: the bias_attr of it

        Return the Variable of after append bias op
        """
        size = list(input_var.shape[dim_start:dim_end])
        bias_attr = bias_attr
        if not bias_attr:
            return input_var

        b = self.create_parameter(
            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
        self.append_op(
            type='elementwise_add',
            inputs={'X': [input_var],
                    'Y': [b]},
            outputs={'Out': [tmp]},
            attrs={'axis': dim_start})
        return tmp

    # TODO: this should not be called anymore after all activation func move to Layers
    def append_activation(self, input_var, act=None, use_cudnn=None,
                          use_mkl_dnn=None):
        """Append activation

           Args:
               input_var: the input variable. The len(input_var.shape) is
               larger or equal than 2.
               act: activation type
               use_mkl_dnn: if use mkldnn
               use_cudnn: if use cudnn

        Return the Variable of after append activation
        """
        act = act
        if act is None:
            return input_var
        if isinstance(act, six.string_types):
            act = {'type': act}
        else:
            raise TypeError(str(act) + " should be unicode or str")

        if (use_cudnn is not None) and use_cudnn:
            act['use_cudnn'] = use_cudnn
        if (use_mkl_dnn is not None) and use_mkl_dnn:
            act['use_mkldnn'] = use_mkl_dnn
        act_type = act.pop('type')

        tmp = input_var
        # NOTE(dzhwinter): some activation support inplace compution.
        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
        if not _in_imperative_mode() and core.IsInplace(act_type):
            tmp = input_var
        else:
            tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
        self.append_op(
            type=act_type,
            inputs={"X": [input_var]},
            outputs={"Out": [tmp]},
            attrs=act)
        return tmp

    def is_instance(self, param, cls):
        """Check if the input parameter is instance of input class

           Args:
               param: parameter to be check
               cls: class of the parameter

        Return result of the check (True or False)
        """
        param = param
        if not isinstance(param, cls):
            raise TypeError("The input {0} parameter of method {1} must be {2}",
                            param, self.layer_type, cls.__name__)
python/paddle/fluid/imperative/layers.py
...
...
@@ -19,8 +19,8 @@ import numpy as np
import collections
from .. import unique_name
from paddle.fluid import core
from .layer_object_helper import LayerObjectHelper
from paddle.fluid import framework
from paddle.fluid.imperative import base

__all__ = ['Layer', 'PyLayer']
...
...
@@ -44,6 +44,8 @@ class Layer(core.Layer):
        self._parameters = collections.OrderedDict()
        self._sub_layers = collections.OrderedDict()
        self._helper = LayerObjectHelper(self._full_name)

    def full_name(self):
        """Full name for this layers.
...
...
@@ -53,6 +55,51 @@ class Layer(core.Layer):
"""
return
self
.
_full_name
def
create_parameter
(
self
,
attr
,
shape
,
dtype
,
is_bias
=
False
,
default_initializer
=
None
):
"""Create parameters for this layers.
Args:
attr: [ParamAttr] should be the parameter attribute for this parameter
shape: shape of the paramter
dtype: data type of this parameter
is_bias: if this is a bias parameter
default_initializer: set the default initializer for this parameter
Returns created parameter Variable.
"""
return
self
.
_helper
.
create_parameter
(
attr
,
shape
,
dtype
,
is_bias
,
default_initializer
)
# TODO: Add more parameter list when we need them
def
create_variable
(
self
,
name
=
None
,
persistable
=
None
,
dtype
=
None
,
type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
):
"""Create Variable for this layers.
Args:
name: name of the variable
persistable: if set this variable persistable
dtype: data type of data in the variable
type: type of the variable
Returns created Variable.
"""
if
name
is
not
None
:
var_name
=
"."
.
join
([
self
.
_full_name
,
name
])
else
:
var_name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
_full_name
,
"_generated_var"
]))
return
self
.
_helper
.
main_program
.
current_block
().
create_var
(
name
=
var_name
,
persistable
=
persistable
,
dtype
=
dtype
,
type
=
type
)
def
parameters
(
self
,
include_sublayers
=
True
):
"""Returns a list of Parameters from current and sub-layers.
...
...
python/paddle/fluid/imperative/nn.py
...
...
@@ -41,21 +41,12 @@ class Conv2D(layers.Layer):
                 bias_attr=None,
                 dtype=core.VarDesc.VarType.FP32):
        assert param_attr is not False, "param_attr should not be False here."
        super(Conv2D, self).__init__(name_scope, dtype=dtype)

        # TODO(minqiyang): Move this to the top.
        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(
            self.full_name(),
            param_attr=param_attr,
            bias_attr=bias_attr,
            dtype=dtype,
            act=act)
        super(Conv2D, self).__init__(name_scope)
        self._groups = groups
        self._stride = utils.convert_to_list(stride, 2, 'stride')
        self._padding = utils.convert_to_list(padding, 2, 'padding')
        self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
        self._act = act
        if not isinstance(use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")
        self._use_cudnn = use_cudnn
...
...
@@ -80,28 +71,28 @@ class Conv2D(layers.Layer):
            std = (2.0 / filter_elem_num)**0.5
            return Normal(0.0, std, 0)

        self._filter_param = self._helper.create_parameter(
            attr=self._helper.param_attr,
        self._filter_param = self.create_parameter(
            attr=param_attr,
            shape=filter_shape,
            dtype=self._dtype,
            default_initializer=_get_default_param_initializer())

        if self._use_cudnn:
            self._helper.create_variable(
            self.create_variable(
                name="kCUDNNFwdAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            self._helper.create_variable(
            self.create_variable(
                name="kCUDNNBwdDataAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            self._helper.create_variable(
            self.create_variable(
                name="kCUDNNBwdFilterAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)

        self._bias_param = self._helper.create_parameter(
            attr=self._helper.bias_attr,
        self._bias_param = self.create_parameter(
            attr=bias_attr,
            shape=[num_filters],
            dtype=self._dtype,
            is_bias=True)
...
...
@@ -137,7 +128,7 @@ class Conv2D(layers.Layer):
            attrs={'axis': 1})

        # Currently, we don't support inplace in imperative mode
        return self._helper.append_activation(pre_act)
        return self._helper.append_activation(pre_act, act=self._act)


class Pool2D(layers.Layer):
...
...
@@ -167,9 +158,6 @@ class Pool2D(layers.Layer):
        super(Pool2D, self).__init__(name_scope, dtype=dtype)

        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(self.full_name(), dtype=dtype)

        self._pool_type = pool_type
        self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
        self._pool_padding = utils.convert_to_list(pool_padding, 2,
...
...
@@ -216,28 +204,25 @@ class FC(layers.Layer):
        self._size = size
        self._num_flatten_dims = num_flatten_dims
        self._dtype = dtype
        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(
            self.full_name(),
            param_attr=param_attr,
            bias_attr=bias_attr,
            act=act)
        self._param_attr = param_attr
        self._bias_attr = param_attr
        self._act = act

    def _build_once(self, input):
        input_shape = input.shape
        param_shape = [
            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
        ] + [self._size]
        self._w = self._helper.create_parameter(
            attr=self._helper.param_attr,
        self._w = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=False)

        if self._helper.bias_attr:
        if self._param_attr:
            size = list([self._size])
            self._b = self._helper.create_parameter(
                attr=self._helper.bias_attr,
            self._b = self.create_parameter(
                attr=self._param_attr,
                shape=size,
                dtype=self._dtype,
                is_bias=True)
...
...
@@ -275,7 +260,7 @@ class FC(layers.Layer):
        else:
            pre_activation = pre_bias
        # Currently, we don't support inplace in imperative mode
        return self._helper.append_activation(pre_activation)
        return self._helper.append_activation(pre_activation, act=self._act)


class BatchNorm(layers.Layer):
...
...
@@ -297,16 +282,12 @@ class BatchNorm(layers.Layer):
                 fuse_with_relu=False,
                 use_global_stats=False):
        super(BatchNorm, self).__init__(name_scope)
        self._param_attr = param_attr
        self._param_attr = bias_attr
        self._act = act

        assert bias_attr is not False, "bias_attr should not be False in batch_norm."

        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(
            self.full_name(),
            param_attr=param_attr,
            bias_attr=bias_attr,
            act=act)

        if dtype == core.VarDesc.VarType.FP16:
            self._dtype = core.VarDesc.VarType.FP32
        else:
...
...
@@ -315,23 +296,23 @@ class BatchNorm(layers.Layer):
        param_shape = [num_channels]

        # create parameter
        self._scale = self._helper.create_parameter(
            attr=self._helper.param_attr,
        self._scale = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            default_initializer=Constant(1.0))
        if use_global_stats and self._helper.param_attr.learning_rate == 0.:
        if use_global_stats and self._param_attr.learning_rate == 0.:
            self._scale._stop_gradient = True

        self._bias = self._helper.create_parameter(
            attr=self._helper.bias_attr,
        self._bias = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=True)
        if use_global_stats and self._helper.bias_attr.learning_rate == 0.:
        if use_global_stats and self._param_attr.learning_rate == 0.:
            self._bias._stop_gradient = True

        self._mean = self._helper.create_parameter(
        self._mean = self.create_parameter(
            attr=ParamAttr(
                name=moving_mean_name,
                initializer=Constant(0.0),
...
...
@@ -341,7 +322,7 @@ class BatchNorm(layers.Layer):
            dtype=self._dtype)
        self._mean._stop_gradient = True

        self._variance = self._helper.create_parameter(
        self._variance = self.create_parameter(
            attr=ParamAttr(
                name=moving_variance_name,
                initializer=Constant(1.0),
...
...
@@ -401,7 +382,7 @@ class BatchNorm(layers.Layer):
            })

        # Currently, we don't support inplace in imperative mode
        return self._helper.append_activation(batch_norm_out)
        return self._helper.append_activation(batch_norm_out, self._act)


class Embedding(layers.Layer):
...
...
@@ -466,9 +447,7 @@ class Embedding(layers.Layer):
        if self._remote_prefetch:
            assert self._is_sparse is True and self._is_distributed is False

        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(self.full_name(), param_attr=param_attr)
        self._w = self._helper.create_parameter(
        self._w = self.create_parameter(
            attr=self._param_attr,
            shape=self._size,
            dtype=self._dtype,
...
...
python/paddle/fluid/initializer.py
...
...
@@ -19,7 +19,6 @@ import numpy as np
from .wrapped_decorator import signature_safe_contextmanager
from .core import VarDesc
from . import unique_name
from .imperative import base as imperative_base

__all__ = [
    'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
...
...
@@ -166,7 +165,7 @@ class ConstantInitializer(Initializer):
                'force_cpu': self._force_cpu or force_init_on_cpu()
            },
            stop_gradient=True)

        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
@@ -246,7 +245,7 @@ class UniformInitializer(Initializer):
                attrs={"in_dtype": out_var.dtype,
                       "out_dtype": var.dtype})

        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
@@ -325,7 +324,7 @@ class NormalInitializer(Initializer):
                outputs={"Out": var},
                attrs={"in_dtype": out_var.dtype,
                       "out_dtype": var.dtype})

        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
@@ -404,7 +403,7 @@ class TruncatedNormalInitializer(Initializer):
                outputs={"Out": var},
                attrs={"in_dtype": out_var.dtype,
                       "out_dtype": var.dtype})

        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
@@ -510,7 +509,7 @@ class XavierInitializer(Initializer):
"seed"
:
self
.
_seed
},
stop_gradient
=
True
)
if
not
imperative_base
.
enabled
():
if
not
framework
.
_in_imperative_mode
():
var
.
op
=
op
return
op
...
...
@@ -611,7 +610,7 @@ class MSRAInitializer(Initializer):
"seed"
:
self
.
_seed
},
stop_gradient
=
True
)
if
not
imperative_base
.
enabled
():
if
not
framework
.
_in_imperative_mode
():
var
.
op
=
op
return
op
...
...
@@ -710,7 +709,7 @@ class BilinearInitializer(Initializer):
                'shape': list(shape),
                value_name: values
            })
        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
@@ -769,7 +768,7 @@ class NumpyArrayInitializer(Initializer):
                value_name: values
            },
            stop_gradient=True)
        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
python/paddle/fluid/layer_helper.py
...
...
@@ -15,45 +15,29 @@
from __future__ import print_function

import copy
import itertools
import six
import sys
import numpy as np

from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode
from .framework import Parameter, dtype_is_floating, _in_imperative_mode
from . import unique_name
from paddle.fluid.imperative import base as imperative_base
from paddle.fluid.initializer import Constant, Xavier
from .param_attr import ParamAttr, WeightNormParamAttr
from .param_attr import ParamAttr
from . import core
from six.moves import zip
from .layer_helper_base import LayerHelperBase


class LayerHelper(object):
class LayerHelper(LayerHelperBase):
    def __init__(self, layer_type, **kwargs):
        self.kwargs = kwargs
        self.layer_type = layer_type
        name = self.kwargs.get('name', None)
        # TODO(panyx0718, minqiyang): imperative mode
        # can not use both `layer_type` and `name`. Deprecate LayerHelper
        # and write a Helper for imperative mode.
        if name is None:
            self.kwargs['name'] = unique_name.generate(self.layer_type)
            self.kwargs['name'] = unique_name.generate(layer_type)

    @property
    def name(self):
        return self.kwargs['name']

    @property
    def main_program(self):
        return default_main_program()

    @property
    def startup_program(self):
        return default_startup_program()

    def to_variable(self, x):
        return imperative_base.to_variable(x, self.main_program.current_block())

        super(LayerHelper, self).__init__(
            self.kwargs['name'], layer_type=layer_type)

    def append_op(self, *args, **kwargs):
        return self.main_program.current_block().append_op(*args, **kwargs)
...
...
@@ -82,6 +66,7 @@ class LayerHelper(object):
    def bias_attr(self):
        return ParamAttr._to_attr(self.kwargs.get('bias_attr', None))

    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of param_attr
    def multiple_param_attr(self, length):
        param_attr = self.param_attr
        if isinstance(param_attr, ParamAttr):
...
...
@@ -113,297 +98,13 @@ class LayerHelper(object):
(
dtype
,
each
.
dtype
))
return
dtype
def
_create_weight_normalize
(
self
,
attr
,
shape
,
dtype
):
from
.layers
import
elementwise_mul
,
elementwise_div
,
reshape
# Remove these ops when LayerHelper and layers support indicating
# program and block.
def
__norm_op
(
x
,
out
=
None
,
p
=
2
,
dim
=
None
,
keep_dim
=
False
,
block
=
self
.
startup_program
.
global_block
()):
if
out
is
None
:
out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_norm'
])),
dtype
=
dtype
,
persistable
=
False
)
abs_out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_abs'
])),
dtype
=
dtype
,
persistable
=
False
)
block
.
append_op
(
type
=
'abs'
,
inputs
=
{
'X'
:
x
},
outputs
=
{
                            outputs={'Out': abs_out})
            pow_out = block.create_var(
                name=unique_name.generate(".".join([self.name, 'weight_norm_pow'])),
                dtype=dtype, persistable=False)
            block.append_op(type='pow', inputs={'X': abs_out}, outputs={'Out': pow_out},
                            attrs={'factor': float(p)})
            sum_out = block.create_var(
                name=unique_name.generate(".".join([self.name, 'weight_norm_sum'])),
                dtype=dtype, persistable=False)
            block.append_op(type='reduce_sum', inputs={'X': pow_out}, outputs={'Out': sum_out},
                            attrs={'dim': dim, 'keep_dim': keep_dim,
                                   'reduce_all': True if dim is None else False})
            block.append_op(type='pow', inputs={'X': sum_out}, outputs={'Out': out},
                            attrs={'factor': 1. / p})
            return out

        def __reshape_op(x, shape, out=None,
                         block=self.startup_program.global_block()):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate(".".join([self.name, 'weight_norm_reshape'])),
                    dtype=dtype, persistable=False)
            block.append_op(type='reshape', inputs={'X': x}, outputs={'Out': out},
                            attrs={'shape': shape})
            return out

        def __transpose_op(x, axis, out=None,
                           block=self.startup_program.global_block()):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate(".".join([self.name, 'weight_norm_transpose'])),
                    dtype=dtype, persistable=False)
            block.append_op(type='transpose', inputs={'X': x}, outputs={'Out': out},
                            attrs={'axis': axis})
            return out

        def __norm_except_dim(x, out=None, dim=None,
                              block=self.startup_program.global_block()):
            """Computes the norm over all dimensions except dim"""
            if out is None:
                out = block.create_var(
                    name=unique_name.generate(".".join([self.name, 'weight_norm_norm'])),
                    dtype=dtype, persistable=False)
            if dim is None:
                __norm_op(x, out, dim=dim, block=block)
            elif dim == 0:
                out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
                reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
                norm = __norm_op(reshape, dim=1, block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            elif dim == len(x.shape) - 1:
                out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
                reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
                norm = __norm_op(reshape, dim=0, block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            else:
                perm = list(range(len(x.shape)))
                perm[0], perm[dim] = dim, 0
                transpose = __transpose_op(x, perm, block=block)
                norm = __norm_op(transpose, dim=0, block=block)
                __transpose_op(norm, perm, out=out, block=block)
            return out

        def __weight_normalize(g, v, dim):
            """Calculations for weight normalization"""
            norm = __norm_except_dim(v, dim=dim, block=self.main_program.current_block())
            scale = elementwise_div(x=g, y=norm)
            # The shapes of g and norm are the same.
            # Currently, elementwise_mul only support broadcast when the shape
            # of y is a subset of the shape of x. Thus, we reshape y to squeeze
            # to achive the subset.
            w = elementwise_mul(
                x=v,
                y=scale if dim is None else reshape(x=scale, shape=[v.shape[dim]]),
                axis=-1 if dim is None else dim)
            # To serialize the original parameter for inference, maybe a
            # parameter rather than a variable should be returned.
            return w

        g_param_attr = copy.deepcopy(attr)
        g_param_attr.name = attr.name + '_g'
        g_param_shape = [1] * len(shape)
        if attr.dim is not None:
            g_param_shape[attr.dim] = shape[attr.dim]
        v_param_attr = copy.deepcopy(attr)
        v_param_attr.name = attr.name + '_v'
        v_param_shape = shape
        # Add to startup_program to initialize g and v.
        # Try to reconstruct the initializer of w by initializing g and v.
        # Set the initializers of g and v as below, then the distribution
        # of w is the same as initializing w with the given initializer.
        # For Data-Dependent Initialization, please compute the init-values
        # of g and v in external and then feed the values to g and v by
        # executing an extra program.
        g_param = self.startup_program.global_block().create_parameter(
            dtype=dtype, shape=g_param_shape,
            **g_param_attr._to_kwargs(with_initializer=False))
        v_param = self.startup_program.global_block().create_parameter(
            dtype=dtype, shape=v_param_shape,
            **v_param_attr._to_kwargs(with_initializer=True))
        __norm_except_dim(x=v_param, out=g_param, dim=attr.dim,
                          block=self.startup_program.global_block())
        # Add weight normalization to main_program
        g_param = self.main_program.global_block().create_parameter(
            dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs())
        v_param = self.main_program.global_block().create_parameter(
            dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs())
        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
        return w_param
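The helpers above only assemble `abs`/`pow`/`reduce_sum`/`pow` ops into the program; the arithmetic they end up computing is the weight-normalization reparameterization w = g * v / ||v|| from the paper linked below. A minimal NumPy sketch of that arithmetic (illustrative only, not Paddle code; function names here are hypothetical):

    import numpy as np

    def norm_except_dim(v, dim=None, p=2):
        # p-norm of v over every axis except `dim`, keeping that axis.
        if dim is None:
            return np.power(np.sum(np.abs(v) ** p), 1.0 / p)
        axes = tuple(i for i in range(v.ndim) if i != dim)
        return np.power(np.sum(np.abs(v) ** p, axis=axes, keepdims=True), 1.0 / p)

    def weight_normalize(g, v, dim=None):
        # w = g * v / ||v||, the reparameterization from arxiv.org/abs/1602.07868
        return v * (g / norm_except_dim(v, dim))

    v = np.random.randn(3, 4).astype('float32')
    g = norm_except_dim(v, dim=0)       # startup init: g <- ||v||, so w == v initially
    w = weight_normalize(g, v, dim=0)
    assert np.allclose(w, v, atol=1e-6)

This mirrors why the startup program above runs `__norm_except_dim(x=v_param, out=g_param, ...)`: with g initialized to the norm of v, the normalized weight starts out equal to the originally initialized weight.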
    def create_parameter(self, attr, shape, dtype, is_bias=False,
                         default_initializer=None):
        # Deepcopy the attr so that parameters can be shared in program
        attr = copy.deepcopy(attr)
        assert isinstance(attr, ParamAttr)
        suffix = 'b' if is_bias else 'w'
        if attr.name is None:
            attr.name = unique_name.generate(".".join([self.name, suffix]))

        if default_initializer is None and attr.initializer is None:
            if isinstance(dtype, core.VarDesc.VarType):
                if dtype != core.VarDesc.VarType.FP32 and \
                        dtype != core.VarDesc.VarType.FP64 and \
                        dtype != core.VarDesc.VarType.FP16:
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            else:
                if not (dtype.startswith("float") or dtype == "double"):
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            if is_bias:
                attr._set_default_bias_initializer()
            else:
                attr._set_default_param_initializer()
        else:
            attr._set_default_initializer(default_initializer)

        # If weight normalization is set, insert extra parameters and ops.
        # Refer to https://arxiv.org/pdf/1602.07868.pdf
        if isinstance(attr, WeightNormParamAttr):
            param = self._create_weight_normalize(attr, shape, dtype)
            WeightNormParamAttr.params_with_weight_norm.append(param)
            return param
        if _in_imperative_mode():
            # In imperative mode, we want the returned parameter to be
            # initialized so that it can be used imperatively.
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
        else:
            self.startup_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs())
    def get_parameter(self, name):
        param = self.main_program.global_block().var(name)
        if not isinstance(param, Parameter):
            raise ValueError("no Parameter name %s found" % name)
        return param

    def create_variable_for_type_inference(self, dtype, stop_gradient=False):
        """Create a temporary variable that should be type inferred layer.

        Note:
            The default type will be set to LOD_TENSOR. However, when
            the var is used as operator output, its type will be updated
            based on operator's `VarTypeInference` implementation in
            infer_var_type.
        """
        return self.main_program.current_block().create_var(
            name=unique_name.generate(".".join([self.name, 'tmp'])),
            dtype=dtype,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=stop_gradient)

    def create_variable(self, *args, **kwargs):
        return self.main_program.current_block().create_var(*args, **kwargs)

    def create_global_variable(self, persistable=False, *args, **kwargs):
        """
        create global variable, note that there is no initializer for this global variable.
        Args:
            persistable(bool): True if it is a checkpoint value.
            *args: See create_var's documentation
            **kwargs: See create_var's documentation

        Returns(Variable): the created variable.
        """
        return self.main_program.global_block().create_var(
            *args, persistable=persistable, **kwargs)

    def create_or_get_global_variable(self, name, *args, **kwargs):
        """
        Creates a global variable if not exists and returns the variable and
        a boolean flag which is true when it is a new variable.
        """
        if self.main_program.global_block().has_var(name):
            return self.main_program.global_block().var(name), False
        else:
            return self.create_global_variable(name=name, *args, **kwargs), True

    def set_variable_initializer(self, var, initializer):
        assert isinstance(var, Variable)
        if imperative_base.enabled():
            initializer(var, var.block)
        else:
            self.startup_program.global_block().create_var(
                name=var.name,
                type=var.type,
                dtype=var.dtype,
                shape=var.shape,
                persistable=True,
                initializer=initializer)
    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of bias_attr
    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
        """
        Append bias operator and return its output. If the user does not set
        ...
        """
...
@@ -434,6 +135,7 @@ class LayerHelper(object):
            attrs={'axis': dim_start})
        return tmp

    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of act
    def append_activation(self, input_var):
        act = self.kwargs.get('act', None)
        if act is None:
            ...
...
@@ -448,10 +150,11 @@ class LayerHelper(object):
        if 'use_mkldnn' in self.kwargs:
            act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
        act_type = act.pop('type')
        tmp = input_var
        # NOTE(dzhwinter): some activation support inplace compution.
        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
-       if not imperative_base.enabled() and core.IsInplace(act_type):
+       if not _in_imperative_mode() and core.IsInplace(act_type):
            tmp = input_var
        else:
            tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
...
@@ -462,6 +165,7 @@ class LayerHelper(object):
            attrs=act)
        return tmp

    #TODO (jiabin): should we remove this since it has never be used
    def _get_default_initializer(self, dtype):
        if dtype is None or dtype_is_floating(dtype) is True:
            return Xavier()
...
@@ -469,6 +173,7 @@ class LayerHelper(object):
        # For integer and boolean types, initialize with all zeros
        return Constant()

    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of kwargs
    def is_instance(self, param_name, cls):
        param = self.kwargs.get(param_name, None)
        if not isinstance(param, cls):
            ...
python/paddle/fluid/layer_helper_base.py  (new file, 0 → 100644)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import copy
import numpy as np

from .framework import Variable, default_main_program, default_startup_program, _in_imperative_mode, _current_expected_place
from . import unique_name
from .param_attr import ParamAttr, WeightNormParamAttr
from . import core


class LayerHelperBase(object):
    def __init__(self, name, layer_type):
        self._layer_type = layer_type
        self._name = name

    @property
    def name(self):
        return self._name

    @property
    def layer_type(self):
        return self._layer_type

    @property
    def main_program(self):
        return default_main_program()

    @property
    def startup_program(self):
        return default_startup_program()

    def to_variable(self, value, block=None):
        """convert value to variable

        Args:
            value: value to be convert
            block: the block of the variable

        Return Variable construct from value
        """
        if isinstance(value, np.ndarray):
            assert _in_imperative_mode(
            ), "to_variable could only be called in imperative mode"

            if not block:
                block = default_main_program().current_block()
            py_var = Variable(
                block,
                type=core.VarDesc.VarType.LOD_TENSOR,
                name=None,
                shape=value.shape,
                dtype=value.dtype)
            var = py_var._ivar.value()
            tensor = var.get_tensor()
            tensor.set(value, _current_expected_place())
            return py_var
        elif isinstance(value, Variable):
            return value
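A minimal usage sketch of `to_variable` (illustrative only; it assumes the new `layer_helper_base` module introduced by this commit and an imperative guard, and is not taken from the Paddle test suite):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.layer_helper_base import LayerHelperBase

    with fluid.imperative.guard():               # to_variable asserts imperative mode
        helper = LayerHelperBase('demo', 'demo')
        x = np.ones([2, 2], dtype='float32')
        var = helper.to_variable(x)              # numpy.ndarray -> LoDTensor-backed Variable
        same = helper.to_variable(var)           # an existing Variable is passed through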
    def _create_weight_normalize(self, attr, shape, dtype):
        from .layers import elementwise_mul, elementwise_div, reshape

        # Remove these ops when LayerHelper and layers support indicating
        # program and block.
        #
        # The body of this method -- the nested __norm_op, __reshape_op,
        # __transpose_op, __norm_except_dim and __weight_normalize helpers, and the
        # creation of the g/v parameters in startup_program and main_program -- is
        # moved here verbatim from LayerHelper._create_weight_normalize, reconstructed
        # in full in the layer_helper.py section above. It ends with:

        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
        return w_param
    # TODO: hide the func after we move the layers to Layers
    def create_parameter(self, attr, shape, dtype, is_bias=False,
                         default_initializer=None):
        """Create parameters for this layers.

        Args:
            attr: [ParamAttr] should be the parameter attribute for this parameter
            shape: shape of the paramter
            dtype: data type of this parameter
            is_bias: if this is a bias parameter
            default_initializer: set the default initializer for this parameter

        Returns created parameter Variable.
        """
        # Deepcopy the attr so that parameters can be shared in program
        attr = copy.deepcopy(attr)
        if attr is None:
            attr = ParamAttr._to_attr(attr)
        assert isinstance(attr, ParamAttr)
        suffix = 'b' if is_bias else 'w'
        if attr.name is None:
            attr.name = unique_name.generate(".".join([self.name, suffix]))

        if default_initializer is None and attr.initializer is None:
            if isinstance(dtype, core.VarDesc.VarType):
                if dtype != core.VarDesc.VarType.FP32 and \
                        dtype != core.VarDesc.VarType.FP64 and \
                        dtype != core.VarDesc.VarType.FP16:
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            else:
                if not (dtype.startswith("float") or dtype == "double"):
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            if is_bias:
                attr._set_default_bias_initializer()
            else:
                attr._set_default_param_initializer()
        else:
            attr._set_default_initializer(default_initializer)

        # If weight normalization is set, insert extra parameters and ops.
        # Refer to https://arxiv.org/pdf/1602.07868.pdf
        if isinstance(attr, WeightNormParamAttr):
            param = self._create_weight_normalize(attr, shape, dtype)
            WeightNormParamAttr.params_with_weight_norm.append(param)
            return param
        if _in_imperative_mode():
            # In imperative mode, we want the returned parameter to be
            # initialized so that it can be used imperatively.
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
        else:
            self.startup_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs())
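To make the weight-normalization branch above concrete, here is a small pure-Python illustration of the extra parameter shapes it creates (the numeric values are illustrative; the shape logic follows `_create_weight_normalize`):

    # For a conceptual weight w of shape [784, 10] with WeightNormParamAttr(dim=1):
    shape = [784, 10]
    dim = 1

    g_param_shape = [1] * len(shape)
    if dim is not None:
        g_param_shape[dim] = shape[dim]
    v_param_shape = shape

    print(g_param_shape, v_param_shape)   # [1, 10] [784, 10]
    # g holds one norm per slice along `dim`, v keeps the full direction tensor,
    # and w = g * v / ||v|| is re-assembled in main_program on every use.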
    def create_variable_for_type_inference(self, dtype, stop_gradient=False):
        """Create a temporary variable that should be type inferred layer.

        Note:
            The default type will be set to LOD_TENSOR. However, when
            the var is used as operator output, its type will be updated
            based on operator's `VarTypeInference` implementation in
            infer_var_type.
        """
        return self.main_program.current_block().create_var(
            name=unique_name.generate(".".join([self.name, 'tmp'])),
            dtype=dtype,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=stop_gradient)

    def create_variable(self, *args, **kwargs):
        """Create Variable for this layers.
        Returns created Variable.
        """
        return self.main_program.current_block().create_var(*args, **kwargs)

    def create_global_variable(self, persistable=False, *args, **kwargs):
        """
        create global variable, note that there is no initializer for this global variable.
        Args:
            persistable(bool): True if it is a checkpoint value.
            *args: See create_var's documentation
            **kwargs: See create_var's documentation

        Returns(Variable): the created variable.
        """
        return self.main_program.global_block().create_var(
            *args, persistable=persistable, **kwargs)

    def create_or_get_global_variable(self, name, *args, **kwargs):
        """
        Creates a global variable if not exists and returns the variable and
        a boolean flag which is true when it is a new variable.
        """
        if self.main_program.global_block().has_var(name):
            return self.main_program.global_block().var(name), False
        else:
            return self.create_global_variable(name=name, *args, **kwargs), True

    def set_variable_initializer(self, var, initializer):
        """Set target Variable's initializer

        Args:
            var: target Variable
            initializer: initializer to use
        """
        assert isinstance(var, Variable)
        if _in_imperative_mode():
            initializer(var, var.block)
        else:
            self.startup_program.global_block().create_var(
                name=var.name,
                type=var.type,
                dtype=var.dtype,
                shape=var.shape,
                persistable=True,
                initializer=initializer)
python/paddle/fluid/layers/control_flow.py
...
@@ -848,7 +848,7 @@ def create_array(dtype):

 @templatedoc()
-def less_than(x, y, force_cpu=None, cond=None, **ignored):
+def less_than(x, y, force_cpu=None, cond=None):
     """
     ${comment}
...
@@ -1800,7 +1800,7 @@ def reorder_lod_tensor_by_rank(x, rank_table):
     return out


-def is_empty(x, cond=None, **ignored):
+def is_empty(x, cond=None):
     """
     Test whether a Variable is empty.
...
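Dropping the `**ignored` catch-all means stray keyword arguments now raise a `TypeError` instead of being silently swallowed. A short usage sketch of the two layers with their remaining signatures (illustrative, not part of the commit):

    import paddle.fluid as fluid

    a = fluid.layers.data(name='a', shape=[1], dtype='int64')
    b = fluid.layers.data(name='b', shape=[1], dtype='int64')
    lt = fluid.layers.less_than(x=a, y=b)   # elementwise a < b
    empty = fluid.layers.is_empty(x=a)      # scalar bool: whether a holds no elements
    # fluid.layers.less_than(x=a, y=b, foo=1) would now raise TypeError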
python/paddle/fluid/layers/detection.py
...
@@ -51,6 +51,8 @@ __all__ = [
     'yolov3_loss',
     'box_clip',
     'multiclass_nms',
     'distribute_fpn_proposals',
     'box_decoder_and_assign',
 ]
...
@@ -2221,3 +2223,138 @@ def multiclass_nms(bboxes,
     output.stop_gradient = True
     return output


def distribute_fpn_proposals(fpn_rois, min_level, max_level, refer_level,
                             refer_scale, name=None):
    """
    In Feature Pyramid Networks (FPN) models, it is needed to distribute all
    proposals into different FPN level, with respect to scale of the proposals,
    the referring scale and the referring level. Besides, to restore the order
    of proposals, we return an array which indicates the original index of rois
    in current proposals. To compute FPN level for each roi, the formula is
    given as follows:

    .. math::

        roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}

        level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level)

    where BBoxArea is a function to compute the area of each roi.

    Args:
        fpn_rois(variable): The input fpn_rois, the second dimension is 4.
        min_level(int): The lowest level of FPN layer where the proposals come
                        from.
        max_level(int): The highest level of FPN layer where the proposals
                        come from.
        refer_level(int): The referring level of FPN layer with specified scale.
        refer_scale(int): The referring scale of FPN layer with specified level.
        name(str|None): The name of this operator.

    Returns:
        tuple:
               A tuple(multi_rois, restore_ind) is returned. The multi_rois is
               a list of segmented tensor variables. The restore_ind is a 2D
               Tensor with shape [N, 1], N is the number of total rois. It is
               used to restore the order of fpn_rois.

    Examples:
        .. code-block:: python

            fpn_rois = fluid.layers.data(
                name='data', shape=[4], dtype='float32', lod_level=1)
            multi_rois, restore_ind = fluid.layers.distribute_fpn_proposals(
                fpn_rois=fpn_rois,
                min_level=2,
                max_level=5,
                refer_level=4,
                refer_scale=224)
    """
    helper = LayerHelper('distribute_fpn_proposals', **locals())
    dtype = helper.input_dtype()
    num_lvl = max_level - min_level + 1
    multi_rois = [
        helper.create_variable_for_type_inference(dtype) for i in range(num_lvl)
    ]
    restore_ind = helper.create_variable_for_type_inference(dtype='int32')
    helper.append_op(
        type='distribute_fpn_proposals',
        inputs={'FpnRois': fpn_rois},
        outputs={'MultiFpnRois': multi_rois,
                 'RestoreIndex': restore_ind},
        attrs={
            'min_level': min_level,
            'max_level': max_level,
            'refer_level': refer_level,
            'refer_scale': refer_scale
        })
    return multi_rois, restore_ind
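A small NumPy check of the level-assignment formula in the docstring above, written to match the reference implementation used in the new unit test further below (illustrative values; `fpn_level` is a hypothetical helper, not a Paddle API):

    import numpy as np

    def fpn_level(rois, refer_scale=224, refer_level=4, min_level=2, max_level=5):
        # rois: [N, 4] boxes given as (x1, y1, x2, y2)
        w = rois[:, 2] - rois[:, 0] + 1
        h = rois[:, 3] - rois[:, 1] + 1
        roi_scale = np.sqrt(w * h)
        lvl = np.floor(refer_level + np.log2(roi_scale / refer_scale + 1e-6))
        return np.clip(lvl, min_level, max_level)

    rois = np.array([[0., 0., 111., 111.],    # ~112x112: one octave below 224 -> level 3
                     [0., 0., 223., 223.],    # ~224x224: the referring scale   -> level 4
                     [0., 0., 447., 447.]])   # ~448x448: one octave above      -> level 5
    print(fpn_level(rois))                    # [3. 4. 5.]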
@templatedoc()
def box_decoder_and_assign(prior_box, prior_box_var, target_box, box_score,
                           box_clip, name=None):
    """
    ${comment}
    Args:
        prior_box(${prior_box_type}): ${prior_box_comment}
        prior_box_var(${prior_box_var_type}): ${prior_box_var_comment}
        target_box(${target_box_type}): ${target_box_comment}
        box_score(${box_score_type}): ${box_score_comment}
        box_clip(${box_clip_type}): ${box_clip_comment}
        name(str|None): The name of this operator
    Returns:
        decode_box(Variable), output_assign_box(Variable):
            two variables:
            - decode_box(${decode_box_type}): ${decode_box_comment}
            - output_assign_box(${output_assign_box_type}): ${output_assign_box_comment}
    Examples:
        .. code-block:: python

            pb = fluid.layers.data(
                name='prior_box', shape=[20, 4], dtype='float32')
            pbv = fluid.layers.data(
                name='prior_box_var', shape=[1, 4], dtype='float32')
            loc = fluid.layers.data(
                name='target_box', shape=[20, 4*81], dtype='float32')
            scores = fluid.layers.data(
                name='scores', shape=[20, 81], dtype='float32')
            decoded_box, output_assign_box = fluid.layers.box_decoder_and_assign(
                pb, pbv, loc, scores, 4.135)
    """
    helper = LayerHelper("box_decoder_and_assign", **locals())
    decoded_box = helper.create_variable_for_type_inference(dtype=prior_box.dtype)
    output_assign_box = helper.create_variable_for_type_inference(dtype=prior_box.dtype)

    helper.append_op(
        type="box_decoder_and_assign",
        inputs={
            "PriorBox": prior_box,
            "PriorBoxVar": prior_box_var,
            "TargetBox": target_box,
            "BoxScore": box_score
        },
        attrs={"box_clip": box_clip},
        outputs={
            "DecodeBox": decoded_box,
            "OutputAssignBox": output_assign_box
        })
    return decoded_box, output_assign_box
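For orientation, here is a worked single-box example of the standard R-CNN decoding that this op performs; the numbers are made up, and the full NumPy reference lives in test_box_decoder_and_assign_op.py further below:

    import numpy as np

    prior = np.array([10., 10., 49., 29.])         # x1, y1, x2, y2 -> w = 40, h = 20
    var = np.array([0.1, 0.1, 0.2, 0.2])           # prior_box_var weights
    delta = np.array([1.0, 0.5, 0.0, 0.0])         # raw target_box offsets
    box_clip = 4.135

    dx, dy, dw, dh = delta * var
    dw, dh = min(dw, box_clip), min(dh, box_clip)  # clip before exp()
    w, h = 40., 20.
    cx, cy = 10. + 0.5 * w, 10. + 0.5 * h          # 30, 20
    px, py = dx * w + cx, dy * h + cy              # 34, 21
    pw, ph = np.exp(dw) * w, np.exp(dh) * h        # 40, 20
    decoded = [px - 0.5 * pw, py - 0.5 * ph,
               px + 0.5 * pw - 1, py + 0.5 * ph - 1]
    print(decoded)                                 # [14.0, 11.0, 53.0, 30.0]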
python/paddle/fluid/layers/nn.py
...
@@ -4833,11 +4833,6 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
     """

     def __check_input(x, y):
-        if len(y.shape) > len(x.shape):
-            raise ValueError(
-                "Invalid inputs for matmul. "
-                "x's rank should be always greater than or equal to y'rank.")
         x_shape = list(x.shape)
         y_shape = list(y.shape)
         if len(x_shape) == 1:
...
@@ -4853,10 +4848,11 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
         if x_shape[-1] != y_shape[-2]:
             raise ValueError("Invalid inputs for matmul.")

-        if len(y_shape) > 2:
+        if len(y_shape) > 2 and len(x_shape) > 2:
             for i, dim_x in enumerate(x_shape[:-2]):
                 if dim_x != y_shape[i]:
-                    raise ValueError("Invalid inputs for matmul.")
+                    raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" %
+                                     (x.shape, y.shape))

     __check_input(x, y)
...
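A plain-Python restatement of the revised shape rule, covering only the two hunks shown (a sketch, not the Paddle source; it omits the rank-1 handling elided above):

    def check_matmul_shapes(x_shape, y_shape):
        if x_shape[-1] != y_shape[-2]:
            raise ValueError("Invalid inputs for matmul.")
        # Batch dims are compared only when both operands carry batch dims.
        if len(y_shape) > 2 and len(x_shape) > 2:
            for i, dim_x in enumerate(x_shape[:-2]):
                if dim_x != y_shape[i]:
                    raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" %
                                     (x_shape, y_shape))

    check_matmul_shapes([4, 3], [8, 3, 5])      # now accepted: rank-2 x against a batched y
    check_matmul_shapes([8, 4, 3], [8, 3, 5])   # batched on both sides: batch dims must match
    # check_matmul_shapes([8, 4, 3], [7, 3, 5]) would raise: mismatched batch dimension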
python/paddle/fluid/layers/tensor.py
...
@@ -142,7 +142,8 @@ def create_global_var(shape,
 def cast(x, dtype):
     """
     This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts
-    it to the output with :attr:`dtype`.
+    it to the output with :attr:`dtype`. It's meaningless if the output
+    dtype equals the input dtype, but it's fine if you do so.

     Args:
         x (Variable): The input Variable for casting.
...
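A short illustration of the same-dtype cast that the reworded docstring now explicitly allows (illustrative only):

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[3], dtype='float32')
    y = fluid.layers.cast(x, 'int64')      # real conversion
    z = fluid.layers.cast(x, 'float32')    # no-op conversion, harmless after this change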
python/paddle/fluid/optimizer.py
...
@@ -379,7 +379,7 @@ class Optimizer(object):
         self._dtype = loss.dtype
         program = loss.block.program
         optimize_ops = []
-        if imperative_base.enabled():
+        if framework._in_imperative_mode():
             if parameter_list is not None:
                 parameters = parameter_list
             else:
...
python/paddle/fluid/parallel_executor.py
...
@@ -106,13 +106,18 @@ class ParallelExecutor(object):
             else framework.default_main_program()
         self._compiled_program = compiler.CompiledProgram(main_program)
+        if share_vars_from:
+            assert isinstance(
+                share_vars_from, ParallelExecutor
+            ), "The share_vars_from should be ParallelExecutor."
         self._compiled_program.with_data_parallel(
             loss_name=loss_name,
             build_strategy=build_strategy,
             exec_strategy=exec_strategy,
-            share_vars_from=share_vars_from)
+            share_vars_from=share_vars_from._compiled_program
+            if share_vars_from else None)
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
-        self._executor = executor.Executor(self._place)
+        self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)

     def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
...
@@ -180,7 +185,7 @@ class ParallelExecutor(object):
             loss = pe.run(feed=feeder.feed(cur_batch),
                           fetch_list=[avg_cost.name]))
         """
-        return self._executor.run(program=self._compiled_program,
+        return self._exe.run(program=self._compiled_program,
                              scope=self._scope,
                              feed=feed,
                              fetch_list=fetch_list,
...
python/paddle/fluid/tests/test_detection.py
...
@@ -504,5 +504,21 @@ class TestMulticlassNMS(unittest.TestCase):
             self.assertIsNotNone(output)


class TestDistributeFpnProposals(unittest.TestCase):
    def test_distribute_fpn_proposals(self):
        program = Program()
        with program_guard(program):
            fpn_rois = fluid.layers.data(
                name='data', shape=[4], dtype='float32', lod_level=1)
            multi_rois, restore_ind = layers.distribute_fpn_proposals(
                fpn_rois=fpn_rois,
                min_level=2,
                max_level=5,
                refer_level=4,
                refer_scale=224)
            self.assertIsNotNone(multi_rois)
            self.assertIsNotNone(restore_ind)


 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
...
@@ -70,3 +70,17 @@ def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
             fetch_list=['x@GRAD', 'out'])
         __assert_close(x_grad, out[0], 'x@GRAD')


def format_reorder(out, size):
    in_n = size[0]
    out_h = size[2]
    out_w = size[3]
    out_c = size[1]
    out_tmp = np.zeros((in_n, out_h, out_w, out_c))
    for n in range(in_n):
        for i in range(out_h):
            for j in range(out_w):
                for m in range(out_c):
                    out_tmp[n, i, j, m] = out[n, m, i, j]
    return out_tmp.reshape(in_n, out_c, out_h, out_w)
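The quadruple loop in `format_reorder` is an NCHW-to-NHWC permutation followed by a flat reshape back to the NCHW extents. A vectorized NumPy equivalent, shown only as a sketch to clarify what the loop computes (not part of the commit):

    import numpy as np

    def format_reorder_fast(out, size):
        in_n, out_c, out_h, out_w = size
        return np.transpose(out, (0, 2, 3, 1)).reshape(in_n, out_c, out_h, out_w)

    x = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5)
    slow = np.zeros((2, 4, 5, 3))
    for n in range(2):
        for i in range(4):
            for j in range(5):
                for m in range(3):
                    slow[n, i, j, m] = x[n, m, i, j]
    assert np.array_equal(format_reorder_fast(x, [2, 3, 4, 5]), slow.reshape(2, 3, 4, 5))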
python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
...
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
 from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp
+from mkldnn_op_test import format_reorder


 def conv2d_forward_refer(input, filter, group, conv_param):
...
@@ -29,20 +30,6 @@ def conv2d_forward_refer(input, filter, group, conv_param):
     return format_reorder(out, size)


-def format_reorder(out, size):
-    in_n = size[0]
-    out_h = size[2]
-    out_w = size[3]
-    out_c = size[1]
-    out_tmp = np.zeros((in_n, out_h, out_w, out_c))
-    for n in range(in_n):
-        for i in range(out_h):
-            for j in range(out_w):
-                for m in range(out_c):
-                    out_tmp[n, i, j, m] = out[n, m, i, j]
-    return out_tmp.reshape(in_n, out_c, out_h, out_w)


 class TestConv2dInt8Op(TestConv2dOp):
     def setUp(self):
         self.op_type = "conv2d"
...
python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py  (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest
from mkldnn_op_test import format_reorder


class TestReQuantizeOp(OpTest):
    def setUp(self):
        self.op_type = 'requantize'
        self.scale_in = 2.0
        self.scale_out = 1.5
        self.input_size = [1, 1, 5, 5]
        self.data_type = 'int8'
        self.set_scale()
        self.set_data_type()

        scale_shift = self.scale_out / self.scale_in

        if self.data_type == 'int8':
            input = (np.random.randint(0, 100, self.input_size) - 50).astype(self.data_type)
            output_tmp = np.round(input.astype('float32') * scale_shift).astype('int8')
        else:
            input = (np.random.randint(0, 100, self.input_size)).astype(self.data_type)
            output_tmp = np.round(input.astype('float32') * scale_shift).astype('uint8')

        output = format_reorder(output_tmp, self.input_size)

        self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(input)}
        self.outputs = {'Output': output}
        self.attrs = {'Scale_in': self.scale_in, 'Scale_out': self.scale_out}

    def test_check_output(self):
        self.check_output()

    def set_scale(self):
        pass

    def set_data_type(OpTest):
        pass


# --------------------test requantize with s8 input--------------------


class TestReQuantizeOp1(TestReQuantizeOp):
    def set_scale(self):
        self.scale_in = 1.5
        self.scale_out = 1.5


class TestReQuantizeOp2(TestReQuantizeOp):
    def set_scale(self):
        self.scale_in = 0.1
        self.scale_out = 0.2


# --------------------test requantize with u8 input--------------------


class TestReQuantizeOp3(TestReQuantizeOp1):
    def set_data_type(self):
        self.data_type = 'uint8'


class TestReQuantizeOp4(TestReQuantizeOp2):
    def set_data_type(self):
        self.data_type = 'uint8'


if __name__ == '__main__':
    unittest.main()
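The requantization checked by the test above rescales each stored value by scale_out / scale_in and rounds back to the narrow integer type; a tiny numeric illustration of that arithmetic (values are made up):

    import numpy as np

    scale_in, scale_out = 2.0, 1.5
    scale_shift = scale_out / scale_in            # 0.75
    x = np.array([-50, -1, 0, 40, 99], dtype='int8')
    y = np.round(x.astype('float32') * scale_shift).astype('int8')
    print(y)                                      # [-38  -1   0  30  74]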
python/paddle/fluid/tests/unittests/test_base_layer.py
...
@@ -16,27 +16,17 @@ import unittest
 import numpy as np

 import paddle.fluid as fluid
-from paddle.fluid.layer_helper import LayerHelper


 class L1(fluid.imperative.Layer):
     def __init__(self, prefix):
         super(L1, self).__init__(prefix)
-        self._helper = LayerHelper(
-            self.full_name(),
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
-        self.w1 = self._helper.create_parameter(
-            attr=self._helper.param_attr,
-            shape=[2, 2],
-            dtype='float32',
-            is_bias=False)
-        self.w2 = self._helper.create_parameter(
-            attr=self._helper.param_attr,
-            shape=[2, 2],
-            dtype='float32',
-            is_bias=False)
+        self._param_attr = fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.1))
+        self.w1 = self.create_parameter(
+            attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False)
+        self.w2 = self.create_parameter(
+            attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False)

     def forward(self):
         return self.w1 + self.w2
...
@@ -67,8 +57,8 @@ class TestBaseLayer(unittest.TestCase):
         with fluid.imperative.guard():
             l = L1('test_one_level')
             ret = l()
-            self.assertEqual(l.w1.name, "test_one_level/L1_0_0.w_0")
-            self.assertEqual(l.w2.name, "test_one_level/L1_0_0.w_1")
+            self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0")
+            self.assertEqual(l.w2.name, "test_one_level/L1_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))

     def test_three_level(self):
...
@@ -76,12 +66,12 @@ class TestBaseLayer(unittest.TestCase):
             l = L3('test_three_level')
             names = [p.name for p in l.parameters()]
             ret = l()
-            self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0_0.w_0")
-            self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0_0.w_1")
-            self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1_0.w_0")
-            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1_0.w_1")
-            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0_0.w_0")
-            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0_0.w_1")
+            self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0.w_0")
+            self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0.w_1")
+            self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1.w_0")
+            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1.w_1")
+            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0.w_0")
+            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
...
python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py  (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
import sys
import math
from op_test import OpTest


def box_decoder_and_assign(deltas, weights, boxes, box_score, box_clip):
    boxes = boxes.astype(deltas.dtype, copy=False)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights
    wx, wy, ww, wh = weights
    dx = deltas[:, 0::4] * wx
    dy = deltas[:, 1::4] * wy
    dw = deltas[:, 2::4] * ww
    dh = deltas[:, 3::4] * wh
    # Prevent sending too large values into np.exp()
    dw = np.minimum(dw, box_clip)
    dh = np.minimum(dh, box_clip)
    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]
    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
    # x2 (note: "- 1" is correct; don't be fooled by the asymmetry)
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1
    # y2 (note: "- 1" is correct; don't be fooled by the asymmetry)
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1

    output_assign_box = []
    for ino in range(len(pred_boxes)):
        rank = np.argsort(-box_score[ino])
        maxidx = rank[0]
        if maxidx == 0:
            maxidx = rank[1]
        beg_pos = maxidx * 4
        end_pos = maxidx * 4 + 4
        output_assign_box.append(pred_boxes[ino, beg_pos:end_pos])
    output_assign_box = np.array(output_assign_box)

    return pred_boxes, output_assign_box


class TestBoxDecoderAndAssignOpWithLoD(OpTest):
    def test_check_output(self):
        self.check_output()

    def setUp(self):
        self.op_type = "box_decoder_and_assign"
        lod = [[4, 8, 8]]
        num_classes = 10
        prior_box = np.random.random((20, 4)).astype('float32')
        prior_box_var = np.array([0.1, 0.1, 0.2, 0.2], dtype=np.float32)
        target_box = np.random.random((20, 4 * num_classes)).astype('float32')
        box_score = np.random.random((20, num_classes)).astype('float32')
        box_clip = 4.135
        output_box, output_assign_box = box_decoder_and_assign(
            target_box, prior_box_var, prior_box, box_score, box_clip)

        self.inputs = {
            'PriorBox': (prior_box, lod),
            'PriorBoxVar': prior_box_var,
            'TargetBox': (target_box, lod),
            'BoxScore': (box_score, lod),
        }
        self.attrs = {'box_clip': box_clip}
        self.outputs = {
            'DecodeBox': output_box,
            'OutputAssignBox': output_assign_box
        }


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py  (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_dist_base import TestDistBase


class TestDistMnistNCCL2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
        self._use_reduce = False
        self._use_reader_alloc = False
        self._nccl2_mode = True

    def test_dist_train(self):
        import paddle.fluid as fluid
        if fluid.core.is_compiled_with_cuda():
            self.check_with_place(
                "dist_mnist.py",
                delta=1,
                need_envs={
                    "FLAGS_enable_parallel_graph": "1",
                    "FLAGS_sync_nccl_allreduce": "1"
                })


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py  (new file, 0 → 100644)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
import math
import sys
from op_test import OpTest


class TestDistributeFPNProposalsOp(OpTest):
    def set_data(self):
        self.init_test_case()
        self.make_rois()
        self.rois_fpn, self.rois_idx_restore = self.calc_rois_distribute()
        self.inputs = {'FpnRois': (self.rois[:, 1:5], self.rois_lod)}
        self.attrs = {
            'max_level': self.roi_max_level,
            'min_level': self.roi_min_level,
            'refer_scale': self.canonical_scale,
            'refer_level': self.canonical_level
        }
        output = [('out%d' % i, self.rois_fpn[i])
                  for i in range(len(self.rois_fpn))]
        self.outputs = {
            'MultiFpnRois': output,
            'RestoreIndex': self.rois_idx_restore
        }

    def init_test_case(self):
        self.roi_max_level = 5
        self.roi_min_level = 2
        self.canonical_scale = 224
        self.canonical_level = 4
        self.images_shape = [512, 512]

    def boxes_area(self, boxes):
        w = (boxes[:, 2] - boxes[:, 0] + 1)
        h = (boxes[:, 3] - boxes[:, 1] + 1)
        areas = w * h
        assert np.all(areas >= 0), 'Negative areas founds'
        return areas

    def map_rois_to_fpn_levels(self, rois, lvl_min, lvl_max):
        s = np.sqrt(self.boxes_area(rois))
        s0 = self.canonical_scale
        lvl0 = self.canonical_level
        target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
        target_lvls = np.clip(target_lvls, lvl_min, lvl_max)
        return target_lvls

    def get_sub_lod(self, sub_lvl):
        sub_lod = []
        max_batch_id = sub_lvl[-1]
        for i in range(max_batch_id.astype(np.int32) + 1):
            sub_lod.append(np.where(sub_lvl == i)[0].size)
        return sub_lod

    def add_multilevel_roi(self, rois, target_lvls, lvl_min, lvl_max):
        rois_idx_order = np.empty((0, ))
        rois_fpn = []
        for lvl in range(lvl_min, lvl_max + 1):
            idx_lvl = np.where(target_lvls == lvl)[0]
            if len(idx_lvl) == 0:
                rois_fpn.append((np.empty(shape=(0, 4)), [[0, 0]]))
                continue
            sub_lod = self.get_sub_lod(rois[idx_lvl, 0])
            rois_fpn.append((rois[idx_lvl, 1:], [sub_lod]))
            rois_idx_order = np.concatenate((rois_idx_order, idx_lvl))
        rois_idx_restore = np.argsort(rois_idx_order).astype(np.int32, copy=False)
        return rois_fpn, rois_idx_restore

    def calc_rois_distribute(self):
        lvl_min = self.roi_min_level
        lvl_max = self.roi_max_level
        target_lvls = self.map_rois_to_fpn_levels(self.rois[:, 1:5], lvl_min, lvl_max)
        rois_fpn, rois_idx_restore = self.add_multilevel_roi(
            self.rois, target_lvls, lvl_min, lvl_max)
        return rois_fpn, rois_idx_restore

    def make_rois(self):
        self.rois_lod = [[100, 200]]
        rois = []
        lod = self.rois_lod[0]
        bno = 0
        for roi_num in lod:
            for i in range(roi_num):
                xywh = np.random.rand(4)
                xy1 = xywh[0:2] * 20
                wh = xywh[2:4] * (self.images_shape - xy1)
                xy2 = xy1 + wh
                roi = [bno, xy1[0], xy1[1], xy2[0], xy2[1]]
                rois.append(roi)
            bno += 1
        self.rois = np.array(rois).astype("float32")

    def setUp(self):
        self.op_type = "distribute_fpn_proposals"
        self.set_data()

    def test_check_output(self):
        self.check_output()
浏览文件 @
2c4fcaa6
...
...
@@ -53,11 +53,15 @@ class MLP(fluid.imperative.Layer):
super
(
MLP
,
self
).
__init__
(
name_scope
)
self
.
_fc1
=
FC
(
self
.
full_name
(),
3
,
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)),
bias_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
self
.
_fc2
=
FC
(
self
.
full_name
(),
4
,
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)),
bias_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
def
forward
(
self
,
inputs
):
...
...
@@ -74,41 +78,37 @@ class SimpleRNNCell(fluid.imperative.Layer):
self
.
step_input_size
=
step_input_size
self
.
hidden_size
=
hidden_size
self
.
output_size
=
output_size
self
.
_dype
=
core
.
VarDesc
.
VarType
.
FP32
from
paddle.fluid.layer_helper
import
LayerHelper
self
.
_helper
=
LayerHelper
(
'SimpleRNNCell'
,
act
=
"tanh"
,
param_attr
=
param_attr
)
self
.
_dtype
=
core
.
VarDesc
.
VarType
.
FP32
self
.
param_attr
=
param_attr
def
_build_once
(
self
,
inputs
,
pre_hidden
):
i2h_param_shape
=
[
self
.
step_input_size
,
self
.
hidden_size
]
h2h_param_shape
=
[
self
.
hidden_size
,
self
.
hidden_size
]
h2o_param_shape
=
[
self
.
output_size
,
self
.
hidden_size
]
self
.
_i2h_w
=
self
.
_helper
.
create_parameter
(
attr
=
self
.
_helper
.
param_attr
,
self
.
_i2h_w
=
self
.
create_parameter
(
attr
=
self
.
param_attr
,
shape
=
i2h_param_shape
,
dtype
=
self
.
_dtype
,
is_bias
=
False
)
self
.
_h2h_w
=
self
.
_helper
.
create_parameter
(
attr
=
self
.
_helper
.
param_attr
,
self
.
_h2h_w
=
self
.
create_parameter
(
attr
=
self
.
param_attr
,
shape
=
h2h_param_shape
,
dtype
=
self
.
_dtype
,
is_bias
=
False
)
self
.
_h2o_w
=
self
.
_helper
.
create_parameter
(
attr
=
self
.
_helper
.
param_attr
,
self
.
_h2o_w
=
self
.
create_parameter
(
attr
=
self
.
param_attr
,
shape
=
h2o_param_shape
,
dtype
=
self
.
_dtype
,
is_bias
=
False
)
def
forward
(
self
,
input
,
pre_hidden
):
tmp_i2h
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
tmp_h2h
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
hidden
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dype
)
out
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dype
)
softmax_out
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
reduce_out
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
tmp_i2h
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
tmp_h2h
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
hidden
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
out
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
softmax_out
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
reduce_out
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
self
.
_helper
.
append_op
(
type
=
"mul"
,
inputs
=
{
"X"
:
input
,
...
...
@@ -132,7 +132,7 @@ class SimpleRNNCell(fluid.imperative.Layer):
outputs
=
{
'Out'
:
hidden
},
attrs
=
{
'axis'
:
-
1
,
'use_mkldnn'
:
False
})
hidden
=
self
.
_helper
.
append_activation
(
hidden
)
hidden
=
self
.
_helper
.
append_activation
(
hidden
,
act
=
'tanh'
)
self
.
_helper
.
append_op
(
type
=
"mul"
,
...
...
@@ -174,7 +174,7 @@ class SimpleRNN(fluid.imperative.Layer):
outs
=
list
()
pre_hiddens
=
list
()
init_hidden
=
fluid
.
layers
.
tensor
.
create_parameter
(
init_hidden
=
self
.
create_parameter
(
attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)),
shape
=
[
1
,
3
],
...
...
@@ -337,10 +337,10 @@ class TestImperative(unittest.TestCase):
self
.
assertTrue
(
np
.
allclose
(
dy_grad
,
static_grad
))
params
=
mlp
.
parameters
(
True
)
self
.
assertEqual
(
"mlp/MLP_0/FC_0
_0
.w_0"
,
params
[
0
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_0
_0
.b_0"
,
params
[
1
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_1
_0
.w_0"
,
params
[
2
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_1
_0
.b_0"
,
params
[
3
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_0.w_0"
,
params
[
0
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_0.b_0"
,
params
[
1
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_1.w_0"
,
params
[
2
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_1.b_0"
,
params
[
3
].
name
)
self
.
assertEqual
(
len
(
params
),
4
)
sublayers
=
mlp
.
sublayers
(
True
)
...
...
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
...
@@ -78,7 +78,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):


 class MNIST(fluid.imperative.Layer):
-    def __init__(self, name_scope, param_attr=None, bias_attr=None):
+    def __init__(self, name_scope):
         super(MNIST, self).__init__(name_scope)

         self._simple_img_conv_pool_1 = SimpleImgConvPool(
...
python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
...
@@ -41,19 +41,17 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
         self._dropout = dropout
         self._input = None
         self._num_steps = num_steps
-        from paddle.fluid.layer_helper import LayerHelper
-        self._helper = LayerHelper('SimpleLSTMRNN', act="tanh")
+        self.cell_array = []
+        self.hidden_array = []

     def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
         self.weight_1_arr = []
         self.weight_2_arr = []
         self.bias_arr = []
-        self.hidden_array = []
-        self.cell_array = []
         self.mask_array = []

         for i in range(self._num_layers):
-            weight_1 = self._helper.create_parameter(
+            weight_1 = self.create_parameter(
                 attr=fluid.ParamAttr(
                     initializer=fluid.initializer.UniformInitializer(
                         low=-self._init_scale, high=self._init_scale)),
...
@@ -62,7 +60,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 default_initializer=fluid.initializer.UniformInitializer(
                     low=-self._init_scale, high=self._init_scale))
             self.weight_1_arr.append(weight_1)
-            bias_1 = self._helper.create_parameter(
+            bias_1 = self.create_parameter(
                 attr=fluid.ParamAttr(
                     initializer=fluid.initializer.UniformInitializer(
                         low=-self._init_scale, high=self._init_scale)),
...
@@ -71,6 +69,11 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 default_initializer=fluid.initializer.Constant(0.0))
             self.bias_arr.append(bias_1)

+    def forward(self, input_embedding, init_hidden=None, init_cell=None):
+        self.cell_array = []
+        self.hidden_array = []
+
         for i in range(self._num_layers):
             pre_hidden = fluid.layers.slice(
                 init_hidden, axes=[0], starts=[i], ends=[i + 1])
             pre_cell = fluid.layers.slice(
...
@@ -82,7 +85,6 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
             self.hidden_array.append(pre_hidden)
             self.cell_array.append(pre_cell)

-    def forward(self, input_embedding, init_hidden=None, init_cell=None):
         res = []
         for index in range(self._num_steps):
             self._input = fluid.layers.slice(
...
@@ -145,8 +147,6 @@ class PtbModel(fluid.imperative.Layer):
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
-        from paddle.fluid.layer_helper import LayerHelper
-        self._helper = LayerHelper('PtbModel', act="tanh")
         self.simple_lstm_rnn = SimpleLSTMRNN(
             self.full_name(),
             hidden_size,
...
@@ -163,13 +163,13 @@ class PtbModel(fluid.imperative.Layer):
                 name='embedding_para',
                 initializer=fluid.initializer.UniformInitializer(
                     low=-init_scale, high=init_scale)))
-        self.softmax_weight = self._helper.create_parameter(
+        self.softmax_weight = self.create_parameter(
             attr=fluid.ParamAttr(),
             shape=[self.hidden_size, self.vocab_size],
             dtype="float32",
             default_initializer=fluid.initializer.UniformInitializer(
                 low=-self.init_scale, high=self.init_scale))
-        self.softmax_bias = self._helper.create_parameter(
+        self.softmax_bias = self.create_parameter(
             attr=fluid.ParamAttr(),
             shape=[self.vocab_size],
             dtype="float32",
...
@@ -180,7 +180,6 @@ class PtbModel(fluid.imperative.Layer):
         pass

     def forward(self, input, label, init_hidden, init_cell):
         init_h = fluid.layers.reshape(
             init_hidden, shape=[self.num_layers, -1, self.hidden_size])
...