Commit 2c4fcaa6
Authored Mar 07, 2019 by sneaxiy
merge develop
Parents: 2a639d5c, 40f1dd81
Showing 125 changed files with 4345 additions and 1337 deletions (+4345 -1337)
Dockerfile  +3  -2
paddle/fluid/API.spec  +5  -3
paddle/fluid/framework/details/memory_optimize_helper.cc  +7  -5
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc  +7  -0
paddle/fluid/framework/inlined_stack.h  +2  -3
paddle/fluid/framework/ir/fuse_pass_base.h  +5  -0
paddle/fluid/framework/ir/graph_helper.cc  +7  -1
paddle/fluid/framework/operator.cc  +0  -41
paddle/fluid/framework/operator.h  +2  -78
paddle/fluid/inference/analysis/argument.h  +6  -0
paddle/fluid/inference/analysis/helper.h  +31  -0
paddle/fluid/inference/analysis/ir_pass_manager.cc  +3  -0
paddle/fluid/inference/analysis/ir_pass_manager.h  +3  -0
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc  +193  -74
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h  +9  -3
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc  +11  -0
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h  +1  -0
paddle/fluid/inference/api/analysis_config.cc  +3  -1
paddle/fluid/inference/api/analysis_predictor.cc  +35  -0
paddle/fluid/inference/api/analysis_predictor.h  +9  -0
paddle/fluid/inference/api/api_impl.cc  +3  -0
paddle/fluid/inference/api/details/zero_copy_tensor.cc  +58  -2
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc  +1  -1
paddle/fluid/inference/api/helper.h  +5  -0
paddle/fluid/inference/api/paddle_analysis_config.h  +3  -1
paddle/fluid/inference/api/paddle_api.h  +21  -1
paddle/fluid/inference/engine.h  +0  -5
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc  +2  -19
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc  +1  -2
paddle/fluid/inference/tensorrt/convert/fc_op.cc  +2  -2
paddle/fluid/inference/tensorrt/convert/op_converter.h  +62  -0
paddle/fluid/inference/tensorrt/convert/prelu_op.cc  +8  -11
paddle/fluid/inference/tensorrt/convert/ut_helper.h  +51  -34
paddle/fluid/inference/tensorrt/engine.cc  +12  -131
paddle/fluid/inference/tensorrt/engine.h  +39  -52
paddle/fluid/inference/tensorrt/helper.h  +29  -0
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt  +2  -1
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu  +7  -0
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h  +9  -5
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu  +9  -2
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h  +13  -7
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu  +14  -1
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h  +30  -13
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu  +6  -0
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h  +6  -3
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h  +9  -1
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc  +48  -0
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h  +78  -0
paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h  +8  -1
paddle/fluid/inference/tensorrt/test_engine.cc  +92  -42
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc  +6  -4
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc  +6  -4
paddle/fluid/inference/tests/api/tester_helper.h  +8  -4
paddle/fluid/inference/tests/api/trt_models_tester.cc  +2  -1
paddle/fluid/memory/allocation/allocator.cc  +2  -5
paddle/fluid/memory/allocation/allocator.h  +3  -3
paddle/fluid/memory/allocation/legacy_allocator.cc  +49  -32
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc  +123  -13
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h  +1  -1
paddle/fluid/memory/detail/buddy_allocator.cc  +39  -36
paddle/fluid/memory/detail/buddy_allocator.h  +8  -3
paddle/fluid/memory/detail/memory_block.h  +5  -4
paddle/fluid/operators/benchmark/op_tester.cc  +190  -17
paddle/fluid/operators/benchmark/op_tester.h  +9  -2
paddle/fluid/operators/benchmark/op_tester_config.cc  +58  -20
paddle/fluid/operators/benchmark/op_tester_config.h  +22  -0
paddle/fluid/operators/cast_op.cc  +3  -1
paddle/fluid/operators/detection/CMakeLists.txt  +3  -0
paddle/fluid/operators/detection/box_decoder_and_assign_op.cc  +169  -0
paddle/fluid/operators/detection/box_decoder_and_assign_op.cu  +147  -0
paddle/fluid/operators/detection/box_decoder_and_assign_op.h  +103  -0
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc  +93  -0
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu  +221  -0
paddle/fluid/operators/detection/distribute_fpn_proposals_op.h  +147  -0
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h  +10  -13
paddle/fluid/operators/jit/benchmark.cc  +23  -0
paddle/fluid/operators/jit/gen/CMakeLists.txt  +1  -0
paddle/fluid/operators/jit/gen/vbroadcast.cc  +91  -0
paddle/fluid/operators/jit/gen/vbroadcast.h  +53  -0
paddle/fluid/operators/jit/helper.cc  +2  -0
paddle/fluid/operators/jit/kernel_base.h  +9  -0
paddle/fluid/operators/jit/kernel_key.cc  +5  -0
paddle/fluid/operators/jit/more/mkl/CMakeLists.txt  +2  -0
paddle/fluid/operators/jit/more/mkl/mkl.cc  +18  -0
paddle/fluid/operators/jit/more/mkl/mkl.h  +10  -0
paddle/fluid/operators/jit/refer/CMakeLists.txt  +2  -0
paddle/fluid/operators/jit/refer/refer.cc  +3  -0
paddle/fluid/operators/jit/refer/refer.h  +17  -0
paddle/fluid/operators/jit/test.cc  +67  -24
paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc  +94  -0
paddle/fluid/operators/recurrent_op.cc  +29  -16
paddle/fluid/operators/requantize_op.cc  +46  -0
paddle/fluid/operators/requantize_op.h  +47  -0
paddle/fluid/operators/reshape_op.cc  +4  -1
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc  +3  -0
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h  +63  -116
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc  +2  -0
paddle/fluid/platform/gpu_info.cc  +58  -1
paddle/fluid/platform/gpu_info.h  +6  -0
paddle/fluid/platform/temporary_allocator.cc  +1  -0
paddle/fluid/pybind/inference_api.cc  +2  -1
python/paddle/fluid/__init__.py  +1  -0
python/paddle/fluid/imperative/layer_object_helper.py  +220  -0
python/paddle/fluid/imperative/layers.py  +48  -1
python/paddle/fluid/imperative/nn.py  +32  -53
python/paddle/fluid/initializer.py  +8  -9
python/paddle/fluid/layer_helper.py  +14  -309
python/paddle/fluid/layer_helper_base.py  +381  -0
python/paddle/fluid/layers/control_flow.py  +2  -2
python/paddle/fluid/layers/detection.py  +137  -0
python/paddle/fluid/layers/nn.py  +3  -7
python/paddle/fluid/layers/tensor.py  +2  -1
python/paddle/fluid/optimizer.py  +1  -1
python/paddle/fluid/parallel_executor.py  +12  -7
python/paddle/fluid/tests/test_detection.py  +16  -0
python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py  +14  -0
python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py  +1  -14
python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py  +93  -0
python/paddle/fluid/tests/unittests/test_base_layer.py  +14  -24
python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py  +96  -0
python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py  +40  -0
python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py  +117  -0
python/paddle/fluid/tests/unittests/test_imperative_basic.py  +26  -26
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py  +1  -1
python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py  +11  -12
Dockerfile

@@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
 # and its size is only one-third of the official one.
 # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
 # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
-    tar -xz -C /usr/local && \
+RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
+    tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
     cp -rf /usr/local/TensorRT/include /usr && \
     cp -rf /usr/local/TensorRT/lib /usr
paddle/fluid/API.spec

@@ -238,7 +238,7 @@ paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], var
 paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae'))
 paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8'))
 paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4'))
-paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '60cb8f843d625abf33f8bf12455b8f99'))
+paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3'))
 paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb'))
 paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535'))
 paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816'))
@@ -262,7 +262,7 @@ paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keyword
 paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', '73bb96ec4783ec1a11e760e8851b0e77'))
 paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713'))
 paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a'))
-paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
+paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
 paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77'))
 paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', '0275133f1dde2aed528b4d3230edf823'))
 paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2'))
@@ -287,7 +287,7 @@ paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=N
 paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
 paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
-paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
+paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
 paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '29a25ba78de79152076cacfc5443137d'))
 paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
 paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
@@ -329,6 +329,8 @@ paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varar
 paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
 paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
+paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
+paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f'))
 paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd'))
 paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47'))
 paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51'))
paddle/fluid/framework/details/memory_optimize_helper.cc

@@ -20,6 +20,9 @@
 #include <numeric>
 #include <sstream>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -302,7 +305,10 @@ std::string OrderedSet::ToString() const {
 bool NodeCanReused(ir::Node* node) {
   // valid the node is a var node
-  if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
+  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
+  if (node == nullptr || !node->IsVar() || node->IsCtrlVar() ||
+      node->Name() == kEmptyVarName)
+    return false;
 
   bool flag = true;
   // op output force generated in cpu, can not be reused.
@@ -348,10 +354,6 @@ bool NodeCanReused(const VarDesc& node) {
   if (shape.empty() || size < MinChunkSize()) {
     return false;
   }
-  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
-  std::string name = node.Name();
-  if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
-    return false;
   return true;
 }
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc

@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
+#include <memory>
+#include <utility>
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
@@ -29,6 +31,11 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
     auto &g = graphs.back();
     g->Set(kGraphVars, new GraphVars(1UL));
     g->Set(kGraphDepVars, new GraphDepVars);
+    auto &stale_ops =
+        graph->Get<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs);
+    g->Erase(details::kStaleProgramOpDescs);
+    g->Set<const std::vector<OpDesc *>>(
+        details::kStaleProgramOpDescs, new std::vector<OpDesc *>(stale_ops));
   }
 
   auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
paddle/fluid/framework/small_stack.h → paddle/fluid/framework/inlined_stack.h

@@ -14,7 +14,6 @@
 
 #pragma once
 
-#include <array>
 #include <deque>
 #include "paddle/fluid/platform/enforce.h"
 
@@ -22,7 +21,7 @@ namespace paddle {
 namespace framework {
 
 template <typename T, size_t N>
-class SmallStack {
+class InlinedStack {
   static_assert(N > 0, "N must be larger than 0");
 
  public:
@@ -66,8 +65,8 @@ class SmallStack {
 
  private:
   T head_[N];
+  size_t size_{0};
   std::deque<T> tail_;
-  size_t size_;
 };
 
 }  // namespace framework
paddle/fluid/framework/ir/fuse_pass_base.h

@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
@@ -24,6 +25,10 @@ namespace ir {
 
 static const char kParamScopeAttr[] = "__param_scope__";
 static const char kFuseStatisAttr[] = "__fuse_statis__";
+// When we use trt or other third_party lib, the parameters are managed by
+// the lib, but not the fluid. So we need to record them to avoid duplicate
+// allocation.
+static const char kRepetitiveParamAttr[] = "__repetitive_param__";
 
 enum FuseOptions {
   DO_NOT_FUSE,  // fusing will not be done
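The new kRepetitiveParamAttr is a graph-level attribute: a pass that hands parameters over to an external engine records their names under it so later passes can avoid allocating a second copy in fluid (the TensorRT subgraph pass further down stores its repetitive_params this way). Below is a minimal sketch of the attribute round-trip, assuming an existing framework::ir::Graph and using the same Set/Get attribute API that appears elsewhere in this commit; the two helper functions are illustrative, not part of the change.

// Sketch only; `graph` is assumed to be a live ir::Graph owned by the caller.
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"

namespace fwir = paddle::framework::ir;

void MarkRepetitiveParams(fwir::Graph *graph,
                          const std::vector<std::string> &repetitive_params) {
  // The graph takes ownership of the heap-allocated attribute value.
  graph->Set(fwir::kRepetitiveParamAttr,
             new std::vector<std::string>(repetitive_params));
}

std::vector<std::string> *GetRepetitiveParams(fwir::Graph *graph) {
  // Returns the parameter names previously recorded by a fusing pass.
  return &graph->Get<std::vector<std::string>>(fwir::kRepetitiveParamAttr);
}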
paddle/fluid/framework/ir/graph_helper.cc

@@ -130,15 +130,21 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     if (adj_list.find(n) == adj_list.end()) {
       adj_list[n] = std::unordered_set<ir::Node *>();
     }
+    std::vector<ir::Node *> nodes;
     for (auto &var : n->inputs) {
       for (auto &adj_n : var->inputs) {
         PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
         VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                 << " -> " << n->Name() << reinterpret_cast<void *>(n)
                 << "  via " << var->Name() << reinterpret_cast<void *>(var);
-        adj_list[n].insert(adj_n);
+        nodes.push_back(adj_n);
       }
     }
+    std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
+      return node1->id() > node2->id();
+    });
+    adj_list[n].insert(std::make_move_iterator(nodes.begin()),
+                       std::make_move_iterator(nodes.end()));
   }
   return adj_list;
 }
paddle/fluid/framework/operator.cc

@@ -467,12 +467,6 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {
   return it->second.empty() ? nullptr : it->second[0];
 }
 
-const Variable* ExecutionContext::LegacyInputVar(
-    const std::string& name) const {
-  auto ipt = op_.Input(name);
-  return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-}
-
 Variable* ExecutionContext::OutputVar(const std::string& name) const {
   auto it = ctx_.outputs.find(name);
   if (it == ctx_.outputs.end()) return nullptr;
@@ -483,22 +477,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
   return it->second.empty() ? nullptr : it->second[0];
 }
 
-Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
-  auto opt = op_.Output(name);
-  return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
-}
-
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
   return Input<LoDTensor>(name);
 }
 
-template <>
-const Tensor* ExecutionContext::LegacyInput<Tensor>(
-    const std::string& name) const {
-  return LegacyInput<LoDTensor>(name);
-}
-
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const {
@@ -521,35 +504,11 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
   return res;
 }
 
-template <>
-const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
-    const std::string& name) const {
-  auto names = op().Inputs(name);
-  std::vector<const Tensor*> res;
-  res.reserve(names.size());
-  std::transform(names.begin(), names.end(), std::back_inserter(res),
-                 [&](const std::string& sub_name) -> const Tensor* {
-                   auto var = scope_.FindVar(sub_name);
-                   if (var == nullptr) return nullptr;
-                   PADDLE_ENFORCE(
-                       var->IsType<LoDTensor>(),
-                       "%s should be LoDTensor, but the received type is %s",
-                       sub_name, ToTypeName(var->Type()));
-                   return &(var->Get<LoDTensor>());
-                 });
-  return res;
-}
-
 template <>
 Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
   return Output<LoDTensor>(name);
 }
 
-template <>
-Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
-  return LegacyOutput<LoDTensor>(name);
-}
-
 template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
     const std::string& name) const {
paddle/fluid/framework/operator.h

@@ -16,9 +16,11 @@ limitations under the License. */
 
 #include <algorithm>
 #include <atomic>
+#include <memory>
 #include <string>
 #include <tuple>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "glog/logging.h"  // For VLOG
@@ -253,31 +255,6 @@ class ExecutionContext {
     return it->second;
   }
 
-  const std::vector<Variable*> LegacyMultiInputVar(
-      const std::string& name) const {
-    auto names = op_.Inputs(name);
-    std::vector<Variable*> res;
-    res.reserve(names.size());
-    std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [this](const std::string& name) {
-                     return name == kEmptyVarName ? nullptr
-                                                  : scope_.FindVar(name);
-                   });
-    return res;
-  }
-
-  std::vector<Variable*> LegacyMultiOutputVar(const std::string& name) const {
-    auto names = op_.Outputs(name);
-    std::vector<Variable*> res;
-    res.reserve(names.size());
-    std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [this](const std::string& name) {
-                     return name == kEmptyVarName ? nullptr
-                                                  : scope_.FindVar(name);
-                   });
-    return res;
-  }
-
   template <typename T>
   const T* Input(const std::string& name) const {
     auto* var = InputVar(name);
@@ -290,22 +267,6 @@ class ExecutionContext {
     return var == nullptr ? nullptr : var->GetMutable<T>();
   }
 
-  template <typename T>
-  const T* LegacyInput(const std::string& name) const {
-    auto* var = LegacyInputVar(name);
-    return var == nullptr ? nullptr : &var->Get<T>();
-  }
-
-  template <typename T>
-  T* LegacyOutput(const std::string& name) const {
-    auto var = LegacyOutputVar(name);
-    return var == nullptr ? nullptr : var->GetMutable<T>();
-  }
-
-  const Variable* LegacyInputVar(const std::string& name) const;
-
-  Variable* LegacyOutputVar(const std::string& name) const;
-
   template <typename T>
   const std::vector<const T*> MultiInput(const std::string& name) const {
     auto it = ctx_.inputs.find(name);
@@ -338,32 +299,6 @@ class ExecutionContext {
     return res;
   }
 
-  template <typename T>
-  const std::vector<const T*> LegacyMultiInput(const std::string& name) const {
-    auto names = op_.Inputs(name);
-    std::vector<const T*> res;
-    res.reserve(names.size());
-    std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [&](const std::string& sub_name) -> const T* {
-                     auto var = scope_.FindVar(sub_name);
-                     return var == nullptr ? nullptr : &var->Get<T>();
-                   });
-    return res;
-  }
-
-  template <typename T>
-  std::vector<T*> LegacyMultiOutput(const std::string& name) const {
-    auto names = op_.Outputs(name);
-    std::vector<T*> res;
-    res.reserve(names.size());
-    std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [&](const std::string& sub_name) -> T* {
-                     auto var = scope_.FindVar(sub_name);
-                     return var == nullptr ? nullptr : var->GetMutable<T>();
-                   });
-    return res;
-  }
-
   platform::Place GetPlace() const { return device_context_.GetPlace(); }
 
   template <typename DeviceContextType>
@@ -433,24 +368,13 @@ class ExecutionContext {
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
 
-template <>
-const Tensor* ExecutionContext::LegacyInput<Tensor>(
-    const std::string& name) const;
-
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const;
 
-template <>
-const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
-    const std::string& name) const;
-
 template <>
 Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
 
-template <>
-Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const;
-
 template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
     const std::string& name) const;
paddle/fluid/inference/analysis/argument.h

@@ -23,8 +23,12 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -133,6 +137,8 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
                       AnalysisConfig::Precision);
+  DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
+                      bool);
 
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
paddle/fluid/inference/analysis/helper.h

@@ -17,10 +17,12 @@ limitations under the License. */
 #include <sys/stat.h>
 #include <cstdio>
 #include <fstream>
+#include <memory>
 #include <set>
 #include <string>
 #include <typeindex>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/framework.pb.h"
@@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
   return "";
 }
 
+static std::string GetTrtEngineSerializedPath(const std::string &model_root,
+                                              const std::string &engine_key) {
+  return model_root + "/trt_serialized_" + engine_key;
+}
+
+static std::string GetTrtEngineSerializedData(
+    const std::string &model_opt_cache_dir, const std::string &engine_key) {
+  std::string trt_serialized_path =
+      GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
+  if (FileExists(trt_serialized_path)) {
+    VLOG(3) << "Trt serialized file: " << trt_serialized_path
+            << "is found here";
+    std::ifstream infile(trt_serialized_path, std::ios::in);
+    std::stringstream buffer;
+    buffer << infile.rdbuf();
+    std::string trt_engine_serialized_data(buffer.str());
+    return trt_engine_serialized_data;
+  }
+  return "";
+}
+
+static void SaveTrtEngineSerializedDataToFile(
+    const std::string &trt_serialized_path,
+    const std::string &engine_serialized_data) {
+  std::ofstream outfile(trt_serialized_path);
+  outfile << engine_serialized_data;
+  outfile.close();
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
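Taken together, the three helpers added above form a simple file-based cache for serialized TensorRT engines: build a path from a cache directory and an engine key, try to read an existing blob, and write a new one on a miss. A minimal usage sketch follows, assuming the caller already holds a serialized engine; the cache directory and key are made-up example values, not taken from this commit.

// Sketch only: exercises GetTrtEngineSerializedPath / GetTrtEngineSerializedData
// / SaveTrtEngineSerializedDataToFile with hypothetical example values.
#include <iostream>
#include <string>
#include "paddle/fluid/inference/analysis/helper.h"

void CacheEngineExample(const std::string &serialized_engine) {
  namespace analysis = paddle::inference::analysis;
  const std::string cache_dir = "./model_opt_cache";  // hypothetical directory
  const std::string engine_key = "demo_engine_key";   // hypothetical key

  // Returns "" when no cached engine exists for this key.
  std::string cached =
      analysis::GetTrtEngineSerializedData(cache_dir, engine_key);
  if (cached.empty()) {
    // Persist the engine under <cache_dir>/trt_serialized_<engine_key>.
    analysis::SaveTrtEngineSerializedDataToFile(
        analysis::GetTrtEngineSerializedPath(cache_dir, engine_key),
        serialized_engine);
  } else {
    std::cout << "reusing cached engine (" << cached.size() << " bytes)\n";
  }
}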
paddle/fluid/inference/analysis/ir_pass_manager.cc

@@ -81,6 +81,9 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set(
           "model_opt_cache_dir",
           new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
+      pass->Set("use_static_engine",
+                new bool(argument->tensorrt_use_static_engine()));
     }
 
     pre_pass = pass_name;
paddle/fluid/inference/analysis/ir_pass_manager.h

@@ -22,7 +22,10 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc

@@ -14,13 +14,13 @@
 
 #include <algorithm>
 #include <set>
+#include <string>
+#include <vector>
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
 #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
 #include "paddle/fluid/string/pretty_log.h"
@@ -33,8 +33,15 @@ using framework::ir::Node;
 
 std::vector<std::string> ExtractParameters(
     const std::unordered_set<Node *> &nodes);
 
+void RenameAndGetOutputs(
+    const std::vector<framework::ir::Node *> &subgraph_nodes,
+    framework::BlockDesc *block_desc,
+    const std::set<std::string> &input_names_with_id,
+    std::set<std::string> *output_names_with_id,
+    std::set<std::string> *output_names,
+    std::unordered_map<std::string, std::string> *output_name_map);
+
 std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
     std::unique_ptr<framework::ir::Graph> graph) const {
   framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
@@ -47,9 +54,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
       Get<int>("min_subgraph_size") /*min subgraph size*/);
   fuser();
 
+  std::vector<std::string> graph_param_names =
+      ExtractParameters(graph->Nodes());
+  // those parameter already exist in trt, and should not have another copy in
+  // fluid.
+  std::vector<std::string> repetitive_params;
+
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateTensorRTOp(node, graph.get());
+      CreateTensorRTOp(node, graph.get(), graph_param_names,
+                       &repetitive_params);
 
       std::unordered_set<const Node *> nodes2remove(
           Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
@@ -64,12 +78,15 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
     }
   }
 
   framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  graph->Set(framework::ir::kRepetitiveParamAttr,
+             new std::vector<std::string>(repetitive_params));
 
   return graph;
 }
 
 std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
-                              const std::set<std::string> &engine_outputs) {
+                              const std::set<std::string> &engine_outputs,
+                              const std::string &predictor_id) {
   std::string engine_hash_key = "";
   for (auto name : engine_inputs) {
     engine_hash_key += name;
@@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
   for (auto name : engine_outputs) {
     engine_hash_key += name;
   }
+  engine_hash_key += predictor_id;
   auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
   return engine_key;
 }
 
-void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
-                                            Graph *graph) const {
+void TensorRtSubgraphPass::CreateTensorRTOp(
+    framework::ir::Node *node, Graph *graph,
+    const std::vector<std::string> &graph_params,
+    std::vector<std::string> *repetitive_params) const {
   auto *op_desc = node->Op();
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());
@@ -116,12 +136,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   // is unique.
   std::set<std::string> input_names;
   std::set<std::string> input_names_with_id;
+  std::vector<std::string> params;
+
+  // The node->inputs containes input tensors and parameters.
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
+      params.push_back(x->Name());
+    }
   }
-  op_desc->SetInput(
-      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
 
   std::set<std::string> output_names;
   std::set<std::string> output_names_with_id;
@@ -130,11 +154,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
     output_names_with_id.insert(x->Name() + std::to_string(x->id()));
   }
 
-  op_desc->SetOutput(
-      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-  op_desc->SetType("tensorrt_engine");
-
   std::unordered_map<std::string, std::string> output_name_map;
+  auto &subgraph_nodes = *Agent(node).subgraph();
 
   // The following procedure is used to rename all the intermediate
   // variables and the output variables of the subgraph.
@@ -148,61 +169,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   // input of a OP, but also the output of a Op, there will be problems.
   // So we have to rename the variable in the subgraph to make sure
   // it is either an OP's input or an OP's output.
-  auto &subgraph_nodes = *Agent(node).subgraph();
-  for (size_t index = 0; index < block_desc.OpSize(); ++index) {
-    framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
-    auto correspond_node = subgraph_nodes[index];
-    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
-
-    std::unordered_map<std::string, size_t> var2id;
-    for (auto *in_var : correspond_node->inputs) {
-      var2id[in_var->Name()] = in_var->id();
-    }
-    // rename for the input variables of op inside subgraph
-    for (int i = 0; i < op->inputs_size(); i++) {
-      // one input
-      auto *in_var = op->mutable_inputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < in_var->arguments_size(); k++) {  // all the arguments
-        std::string arg_value = in_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (input_names_with_id.count(arg_value_with_id)) {
-          replaced_names.push_back(arg_value);
-        } else {
-          replaced_names.push_back(arg_value_with_id);
-        }
-      }
-      in_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        in_var->add_arguments(replaced_names[k]);
-      }
-    }
-    var2id.clear();
-    for (auto out_var : correspond_node->outputs) {
-      var2id[out_var->Name()] = out_var->id();
-    }
-
-    // rename for the output variables of op inside subgraph
-    for (int i = 0; i < op->outputs_size(); i++) {
-      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < out_var->arguments_size(); k++) {
-        std::string arg_value = out_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (output_names_with_id.count(arg_value_with_id)) {
-          output_name_map[arg_value] = arg_value_with_id;
-        }
-        replaced_names.push_back(arg_value_with_id);
-      }
-      out_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        out_var->add_arguments(replaced_names[k]);
-      }
-    }
-  }
+  RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
+                      &output_names_with_id, &output_names, &output_name_map);
 
   // When tensorrt engine runs at the end of the operation,
   // output_mapping help us copy the data from the renamed ITensor
@@ -212,6 +180,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
     PADDLE_ENFORCE(output_name_map.count(name) != 0);
     output_mapping.push_back(output_name_map[name]);
   }
+  PADDLE_ENFORCE(!output_mapping.empty());
 
   auto *vars = block_desc.Proto()->mutable_vars();
   for (framework::ir::Node *node : graph->Nodes()) {
@@ -222,26 +191,83 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
 
   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                  "the block has no var-desc");
-  PADDLE_ENFORCE(!output_mapping.empty());
 
+  // Set attrs
+  op_desc->SetType("tensorrt_engine");
+  op_desc->SetInput(
+      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
+  op_desc->SetOutput(
+      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
+
   op_desc->SetBlockAttr("sub_block", new_block);
   SetAttr(op_desc->Proto(), "subgraph",
           block_desc.Proto()->SerializeAsString());
-  // Set attrs
   SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+  SetAttr(op_desc->Proto(), "parameters", params);
 
   auto enable_int8 = Get<bool>("enable_int8");
-  auto engine_key =
-      GenerateEngineKey(input_names_with_id, output_names_with_id);
+  auto engine_key = GenerateEngineKey(input_names_with_id,
+                                      output_names_with_id, std::to_string(0));
 
   // Get "" when there is no cached calibration table data.
   std::string calibration_data = GetTrtCalibTableData(
...
=
GetTrtCalibTableData
(
Get
<
std
::
string
>
(
"model_opt_cache_dir"
),
engine_key
,
enable_int8
);
Get
<
std
::
string
>
(
"model_opt_cache_dir"
),
engine_key
,
enable_int8
);
SetAttr
(
op_desc
->
Proto
(),
"calibration_data"
,
calibration_data
);
SetAttr
(
op_desc
->
Proto
(),
"calibration_data"
,
calibration_data
);
SetAttr
(
op_desc
->
Proto
(),
"enable_int8"
,
enable_int8
);
SetAttr
(
op_desc
->
Proto
(),
"enable_int8"
,
enable_int8
);
SetAttr
(
op_desc
->
Proto
(),
"engine_key"
,
engine_key
);
SetAttr
(
op_desc
->
Proto
(),
"engine_key"
,
engine_key
);
SetAttr
(
op_desc
->
Proto
(),
"engine_serialized_data"
,
std
::
string
(
""
));
std
::
unique_ptr
<
tensorrt
::
TRTInt8Calibrator
>
calibrator
;
if
(
enable_int8
&&
calibration_data
.
size
()
!=
0
)
{
calibrator
.
reset
(
new
tensorrt
::
TRTInt8Calibrator
(
calibration_data
));
}
bool
use_static_engine
=
Get
<
bool
>
(
"use_static_engine"
);
// When in int8 mode and calibration_mode, the program just produce the
// calibration table data.
bool
calibration_mode
=
(
enable_int8
&&
calibration_data
.
size
()
==
0
);
if
(
!
calibration_mode
&&
use_static_engine
)
{
std
::
copy
(
params
.
begin
(),
params
.
end
(),
std
::
back_inserter
(
*
repetitive_params
));
std
::
string
trt_engine_serialized_data
=
GetTrtEngineSerializedData
(
Get
<
std
::
string
>
(
"model_opt_cache_dir"
),
engine_key
);
if
(
trt_engine_serialized_data
.
empty
())
{
LOG
(
INFO
)
<<
"Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time."
;
std
::
unique_ptr
<
tensorrt
::
TensorRTEngine
>
trt_engine
(
new
tensorrt
::
TensorRTEngine
(
Get
<
int
>
(
"max_batch_size"
),
Get
<
int
>
(
"workspace_size"
),
enable_int8
,
calibrator
.
get
(),
Get
<
int
>
(
"gpu_device_id"
)));
auto
*
scope
=
param_scope
();
framework
::
BlockDesc
block_desc_temp
(
nullptr
,
block_desc
.
Proto
());
std
::
unordered_set
<
std
::
string
>
param_set
(
params
.
begin
(),
params
.
end
());
inference
::
Singleton
<
inference
::
tensorrt
::
OpConverter
>::
Global
()
.
ConvertBlockToTRTEngine
(
&
block_desc_temp
,
*
scope
,
std
::
vector
<
std
::
string
>
(
input_names
.
begin
(),
input_names
.
end
()),
param_set
,
output_mapping
,
trt_engine
.
get
());
nvinfer1
::
IHostMemory
*
serialized_engine_data
=
trt_engine
->
Serialize
();
trt_engine_serialized_data
=
std
::
string
((
const
char
*
)
serialized_engine_data
->
data
(),
serialized_engine_data
->
size
());
SaveTrtEngineSerializedDataToFile
(
GetTrtEngineSerializedPath
(
Get
<
std
::
string
>
(
"model_opt_cache_dir"
),
engine_key
),
trt_engine_serialized_data
);
}
else
{
LOG
(
INFO
)
<<
"Load TRT Optimized Info from "
<<
GetTrtEngineSerializedPath
(
Get
<
std
::
string
>
(
"model_opt_cache_dir"
),
engine_key
);
}
SetAttr
(
op_desc
->
Proto
(),
"engine_serialized_data"
,
trt_engine_serialized_data
);
}
}
}
std
::
vector
<
std
::
string
>
ExtractParameters
(
std
::
vector
<
std
::
string
>
ExtractParameters
(
...
@@ -253,7 +279,7 @@ std::vector<std::string> ExtractParameters(
...
@@ -253,7 +279,7 @@ std::vector<std::string> ExtractParameters(
for
(
const
auto
&
node
:
nodes
)
{
for
(
const
auto
&
node
:
nodes
)
{
if
(
!
node
->
IsOp
())
continue
;
if
(
!
node
->
IsOp
())
continue
;
std
::
string
op_type
=
node
->
Op
()
->
Type
();
std
::
string
op_type
=
node
->
Op
()
->
Type
();
if
(
op_type
==
"feed"
)
{
if
(
op_type
==
"feed"
||
op_type
==
"fetch"
)
{
std
::
vector
<
std
::
string
>
output_names
=
node
->
Op
()
->
OutputArgumentNames
();
std
::
vector
<
std
::
string
>
output_names
=
node
->
Op
()
->
OutputArgumentNames
();
std
::
copy
(
output_names
.
begin
(),
output_names
.
end
(),
std
::
copy
(
output_names
.
begin
(),
output_names
.
end
(),
std
::
back_inserter
(
feed_outputs
));
std
::
back_inserter
(
feed_outputs
));
...
@@ -272,6 +298,99 @@ std::vector<std::string> ExtractParameters(
...
@@ -272,6 +298,99 @@ std::vector<std::string> ExtractParameters(
return
parameters
;
return
parameters
;
}
}
void
RenameAndGetOutputs
(
const
std
::
vector
<
framework
::
ir
::
Node
*>
&
subgraph_nodes
,
framework
::
BlockDesc
*
block_desc
,
const
std
::
set
<
std
::
string
>
&
input_names_with_id
,
std
::
set
<
std
::
string
>
*
output_names_with_id
,
std
::
set
<
std
::
string
>
*
output_names
,
std
::
unordered_map
<
std
::
string
,
std
::
string
>
*
output_name_map
)
{
//// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the
// paddle-tensorrt will do the merging optimization, which fuse those conv
// into one conv, and then trigger bug. So, We should use strategy to avoid
// this optimization for the time being. This bug will be fixed in the future.
std
::
unordered_map
<
std
::
string
/*name*/
,
int
/*ITensor_quote_num*/
>
same_hierarchy_conv2d_num_map
;
for
(
size_t
index
=
0
;
index
<
block_desc
->
OpSize
();
++
index
)
{
framework
::
proto
::
OpDesc
*
op
=
block_desc
->
Op
(
index
)
->
Proto
();
framework
::
OpDesc
op_desc
(
*
op
,
nullptr
);
auto
correspond_node
=
subgraph_nodes
[
index
];
PADDLE_ENFORCE_EQ
(
correspond_node
->
Name
(),
op
->
type
());
std
::
unordered_map
<
std
::
string
,
size_t
>
var2id
;
std
::
unordered_map
<
std
::
string
,
framework
::
ir
::
Node
*>
in_vars
;
for
(
auto
*
in_var
:
correspond_node
->
inputs
)
{
var2id
[
in_var
->
Name
()]
=
in_var
->
id
();
in_vars
[
in_var
->
Name
()]
=
in_var
;
}
// rename for the input variables of op inside subgraph
for
(
int
i
=
0
;
i
<
op
->
inputs_size
();
i
++
)
{
// one input
auto
*
in_var
=
op
->
mutable_inputs
(
i
);
std
::
vector
<
std
::
string
>
replaced_names
;
for
(
int
k
=
0
;
k
<
in_var
->
arguments_size
();
k
++
)
{
// all the arguments
std
::
string
arg_value
=
in_var
->
arguments
(
k
);
std
::
string
arg_value_with_id
=
arg_value
+
std
::
to_string
(
var2id
[
arg_value
]);
if
(
input_names_with_id
.
count
(
arg_value_with_id
))
{
replaced_names
.
push_back
(
arg_value
);
}
else
{
replaced_names
.
push_back
(
arg_value_with_id
);
}
}
in_var
->
clear_arguments
();
for
(
size_t
k
=
0
;
k
<
replaced_names
.
size
();
k
++
)
{
in_var
->
add_arguments
(
replaced_names
[
k
]);
}
}
var2id
.
clear
();
for
(
auto
out_var
:
correspond_node
->
outputs
)
{
var2id
[
out_var
->
Name
()]
=
out_var
->
id
();
}
if
(
op_desc
.
Type
()
==
"conv2d"
)
{
auto
input_var_name
=
op_desc
.
Input
(
"Input"
).
front
();
auto
filter_var_name
=
op_desc
.
Input
(
"Filter"
).
front
();
auto
out_var_name
=
op_desc
.
Output
(
"Output"
).
front
();
auto
filter_shape
=
in_vars
[
filter_var_name
]
->
Var
()
->
GetShape
();
const
std
::
vector
<
int
>
strides
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"strides"
));
const
std
::
vector
<
int
>
paddings
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"paddings"
));
if
(
same_hierarchy_conv2d_num_map
[
input_var_name
]
>
0
)
{
(
*
output_names_with_id
)
.
insert
(
out_var_name
+
std
::
to_string
(
var2id
[
out_var_name
]));
(
*
output_names
).
insert
(
out_var_name
);
}
else
if
(
filter_shape
[
2
]
==
1
&&
filter_shape
[
3
]
==
1
&&
strides
[
0
]
==
1
&&
strides
[
1
]
==
1
&&
paddings
[
0
]
==
0
&&
paddings
[
1
]
==
0
)
{
same_hierarchy_conv2d_num_map
[
input_var_name
]
+=
1
;
}
}
// rename for the output variables of op inside subgraph
for
(
int
i
=
0
;
i
<
op
->
outputs_size
();
i
++
)
{
framework
::
proto
::
OpDesc_Var
*
out_var
=
op
->
mutable_outputs
(
i
);
std
::
vector
<
std
::
string
>
replaced_names
;
for
(
int
k
=
0
;
k
<
out_var
->
arguments_size
();
k
++
)
{
std
::
string
arg_value
=
out_var
->
arguments
(
k
);
std
::
string
arg_value_with_id
=
arg_value
+
std
::
to_string
(
var2id
[
arg_value
]);
if
(
output_names_with_id
->
count
(
arg_value_with_id
))
{
(
*
output_name_map
)[
arg_value
]
=
arg_value_with_id
;
}
replaced_names
.
push_back
(
arg_value_with_id
);
}
out_var
->
clear_arguments
();
for
(
size_t
k
=
0
;
k
<
replaced_names
.
size
();
k
++
)
{
out_var
->
add_arguments
(
replaced_names
[
k
]);
}
}
}
}
}
// namespace analysis
}
// namespace analysis
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
...
...
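The engine-cache helpers called above (GetTrtEngineSerializedData, GetTrtEngineSerializedPath, SaveTrtEngineSerializedDataToFile) come from paddle/fluid/inference/analysis/helper.h, whose bodies are not shown in this commit view. A minimal sketch of what such file-based helpers could look like, assuming they simply key one cache file per engine_key under model_opt_cache_dir; the names follow the call sites above, but the implementation is illustrative only:

#include <fstream>
#include <iterator>
#include <string>

// Illustrative sketch only; the real helpers in analysis/helper.h may differ.
inline std::string GetTrtEngineSerializedPath(const std::string &model_root,
                                              const std::string &engine_key) {
  // One cached file per engine key under the model_opt_cache_dir.
  return model_root + "/trt_serialized_" + engine_key;
}

inline std::string GetTrtEngineSerializedData(const std::string &model_root,
                                              const std::string &engine_key) {
  std::ifstream infile(GetTrtEngineSerializedPath(model_root, engine_key),
                       std::ios::binary);
  // An empty string signals "no cached engine yet" to the subgraph pass.
  if (!infile.good()) return "";
  return std::string(std::istreambuf_iterator<char>(infile),
                     std::istreambuf_iterator<char>());
}

inline void SaveTrtEngineSerializedDataToFile(const std::string &path,
                                              const std::string &data) {
  std::ofstream outfile(path, std::ios::binary);
  outfile.write(data.data(), data.size());
}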
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h  View file @ 2c4fcaa6
@@ -13,7 +13,12 @@
 // limitations under the License.

 #pragma once
-#include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"

 namespace paddle {
@@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
       std::unique_ptr<framework::ir::Graph> graph) const override;

  private:
-  void CreateTensorRTOp(framework::ir::Node *x,
-                        framework::ir::Graph *graph) const;
+  void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
+                        const std::vector<std::string> &graph_params,
+                        std::vector<std::string> *repetitive_params) const;
   void CleanIntermediateOutputs(framework::ir::Node *node);
 };
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc  View file @ 2c4fcaa6
@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // The parameters are on the cpu, therefore, synchronization is not necessary.
   if (!argument->use_gpu()) return;

+  auto &graph = argument->main_graph();
+  std::vector<std::string> repetitive_params;
+
+  if (graph.Has(framework::ir::kRepetitiveParamAttr))
+    repetitive_params = graph.Get<std::vector<std::string>>(
+        framework::ir::kRepetitiveParamAttr);
+
   LOG(INFO) << "Sync params from CPU to GPU";

   PADDLE_ENFORCE(argument->gpu_device_id_valid());
@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // Because there exists the case that new parameter variables are not added to
   // the program in the analysis pass.
   for (auto &var_name : all_vars) {
+    if (std::count(repetitive_params.begin(), repetitive_params.end(),
+                   var_name)) {
+      continue;
+    }
     auto *var = scope->FindLocalVar(var_name);
     PADDLE_ENFORCE(var != nullptr);
     if (var->IsType<framework::LoDTensor>() ||
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h  View file @ 2c4fcaa6
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>

+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/platform/place.h"
paddle/fluid/inference/api/analysis_config.cc  View file @ 2c4fcaa6
@@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(tensorrt_max_batchsize_);
   CP_MEMBER(tensorrt_min_subgraph_size_);
   CP_MEMBER(tensorrt_precision_mode_);
+  CP_MEMBER(trt_use_static_engine_);
   // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
@@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() {
 void AnalysisConfig::EnableTensorRtEngine(
     int workspace_size, int max_batch_size, int min_subgraph_size,
-    AnalysisConfig::Precision precision_mode) {
+    AnalysisConfig::Precision precision_mode, bool use_static) {
 #ifdef PADDLE_WITH_CUDA
   if (!use_gpu()) {
     LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
@@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine(
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
   tensorrt_precision_mode_ = precision_mode;
+  trt_use_static_engine_ = use_static;

   Update();
 #else
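A minimal usage sketch of the extended API, assuming a CUDA build; the model path is a placeholder. Passing use_static = true lets the TensorRT subgraph pass serialize the built engine under model_opt_cache_dir and reload it on later runs instead of rebuilding it.

#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> CreateTrtPredictor() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model_dir");  // placeholder path
  config.EnableUseGpu(100 /*initial GPU memory (MB)*/, 0 /*device id*/);
  // workspace size, max batch, min subgraph size, precision, use_static
  config.EnableTensorRtEngine(1 << 20, 1, 3,
                              paddle::AnalysisConfig::Precision::kFloat32,
                              true /*serialize the built engine to disk*/);
  return paddle::CreatePaddlePredictor(config);
}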
paddle/fluid/inference/api/analysis_predictor.cc  View file @ 2c4fcaa6
@@ -183,6 +183,9 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) {
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {
+  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
+    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+  }
   VLOG(3) << "Predictor::predict";
   inference::Timer timer;
   timer.tic();
@@ -362,6 +365,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
     argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
     argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
+    argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
   }

   if (config_.use_mkldnn_) {
@@ -435,12 +439,14 @@ void AnalysisPredictor::PrepareFeedFetch() {
       }
       feeds_[idx] = op;
       feed_names_[op->Output("Out")[0]] = idx;
+      idx2feeds_[idx] = op->Output("Out")[0];
     } else if (op->Type() == "fetch") {
       int idx = boost::get<int>(op->GetAttr("col"));
       if (fetches_.size() <= static_cast<size_t>(idx)) {
         fetches_.resize(idx + 1);
       }
       fetches_[idx] = op;
+      idx2fetches_[idx] = op->Input("X")[0];
     }
   }
 }
@@ -453,6 +459,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
   var->GetMutable<framework::FeedFetchList>();
 }

+std::vector<std::string> AnalysisPredictor::GetInputNames() {
+  std::vector<std::string> input_names;
+  for (auto &item : idx2feeds_) {
+    input_names.push_back(item.second);
+  }
+  return input_names;
+}
+
+std::vector<std::string> AnalysisPredictor::GetOutputNames() {
+  std::vector<std::string> output_names;
+  for (auto &item : idx2fetches_) {
+    output_names.push_back(item.second);
+  }
+  return output_names;
+}
+
 std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
     const std::string &name) {
   PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
@@ -460,6 +482,13 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
       new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
   res->input_or_output_ = true;
   res->SetName(name);
+  if (platform::is_cpu_place(place_)) {
+    res->SetPlace(PaddlePlace::kCPU);
+  } else {
+    auto gpu_place = boost::get<platform::CUDAPlace>(place_);
+    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
+  }
   return res;
 }
@@ -470,6 +499,12 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
       new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
   res->input_or_output_ = false;
   res->SetName(name);
+  if (platform::is_cpu_place(place_)) {
+    res->SetPlace(PaddlePlace::kCPU);
+  } else {
+    auto gpu_place = boost::get<platform::CUDAPlace>(place_);
+    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
+  }
   return res;
 }
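Together with the ZeroCopyTensor changes further down, the new accessors allow a name-based zero-copy workflow. A sketch, assuming the predictor was created from an AnalysisConfig with feed/fetch ops disabled (SwitchUseFeedFetchOps(false)) and the first input takes float data; the shape here is only an example.

#include <functional>
#include <numeric>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void RunOnce(paddle::PaddlePredictor *predictor) {
  // Look up inputs and outputs by name instead of by index.
  auto in_names = predictor->GetInputNames();
  auto input = predictor->GetInputTensor(in_names[0]);

  std::vector<int> in_shape = {1, 3, 224, 224};      // example shape
  std::vector<float> in_data(1 * 3 * 224 * 224, 0.f);
  input->Reshape(in_shape);
  input->copy_from_cpu(in_data.data());  // host -> predictor place (CPU/GPU)

  predictor->ZeroCopyRun();

  auto out_names = predictor->GetOutputNames();
  auto output = predictor->GetOutputTensor(out_names[0]);
  std::vector<int> out_shape = output->shape();
  int out_num = std::accumulate(out_shape.begin(), out_shape.end(), 1,
                                std::multiplies<int>());
  std::vector<float> out_data(out_num);
  output->copy_to_cpu(out_data.data());  // predictor place -> host
}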
paddle/fluid/inference/api/analysis_predictor.h  View file @ 2c4fcaa6
@@ -15,12 +15,14 @@
 #pragma once
 #include <algorithm>
 #include <map>
+#include <memory>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
 #ifdef PADDLE_WITH_TESTING
@@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor {
            std::vector<PaddleTensor> *output_data,
            int batch_size = -1) override;

+  std::vector<std::string> GetInputNames();
+  std::vector<std::string> GetOutputNames();
+
   std::unique_ptr<ZeroCopyTensor> GetInputTensor(
       const std::string &name) override;
   std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
@@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor {
   std::shared_ptr<framework::ProgramDesc> inference_program_;
   std::vector<framework::OpDesc *> feeds_;
   std::map<std::string, size_t> feed_names_;
+  // Sorted according to the idx.
+  std::map<size_t, std::string> idx2feeds_;
   std::vector<framework::OpDesc *> fetches_;
+  std::map<size_t, std::string> idx2fetches_;
+
   // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
   // concurrency problems, wrong results and memory leak, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
paddle/fluid/inference/api/api_impl.cc  View file @ 2c4fcaa6
@@ -131,6 +131,9 @@ NativePaddlePredictor::~NativePaddlePredictor() {
 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                 std::vector<PaddleTensor> *output_data,
                                 int batch_size) {
+  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
+    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+  }
   VLOG(3) << "Predictor::predict";
   Timer timer;
   timer.tic();
paddle/fluid/inference/api/details/zero_copy_tensor.cc  View file @ 2c4fcaa6
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {
@@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
   return res;
 }

+template <typename T>
+void ZeroCopyTensor::copy_from_cpu(const T *data) {
+  EAGER_GET_TENSOR;
+  PADDLE_ENFORCE_GE(
+      tensor->numel(), 0,
+      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
+      "function before copy data from cpu.");
+  size_t ele_size = tensor->numel() * sizeof(T);
+
+  if (place_ == PaddlePlace::kCPU) {
+    auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
+    std::memcpy(static_cast<void *>(t_data), data, ele_size);
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    platform::CUDAPlace gpu_place(device_);
+    auto *t_data = tensor->mutable_data<T>(gpu_place);
+    auto *dev_ctx =
+        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+
+    memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
+                 data, ele_size, dev_ctx->stream());
+#else
+    PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+  }
+}
+
+template <typename T>
+void ZeroCopyTensor::copy_to_cpu(T *data) {
+  EAGER_GET_TENSOR;
+  auto ele_num = tensor->numel();
+  auto *t_data = tensor->data<T>();
+  auto t_place = tensor->place();
+
+  if (platform::is_cpu_place(t_place)) {
+    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto gpu_place = boost::get<platform::CUDAPlace>(t_place);
+    auto *dev_ctx =
+        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
+                 t_data, ele_num * sizeof(T), dev_ctx->stream());
+#else
+    PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+  }
+}
+
+template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
+template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
+template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
+template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
+
 template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
                                             int *size) const;
 template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
@@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const {
   return tensor;
 }

-std::vector<int64_t> ZeroCopyTensor::shape() const {
+std::vector<int> ZeroCopyTensor::shape() const {
   EAGER_GET_TENSOR;
   PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
-  return framework::vectorize(tensor->dims());
+  return framework::vectorize2int(tensor->dims());
 }

 void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
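Only the float and int64_t specializations are instantiated above, so callers must stick to those element types. A small sketch of feeding int64_t ids through the new path; the tensor name is a placeholder.

#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Sketch: int64_t input through the new copy_from_cpu path. Only the float
// and int64_t specializations are compiled; other element types will not link.
void FeedIds(paddle::PaddlePredictor *predictor) {
  auto ids = predictor->GetInputTensor("word_ids");  // placeholder name
  std::vector<int64_t> id_data = {12, 7, 903, 4};
  ids->Reshape({1, static_cast<int>(id_data.size())});
  ids->copy_from_cpu(id_data.data());
}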
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc  View file @ 2c4fcaa6
@@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);

 void *ZeroCopyTensor::FindTensor() const { return nullptr; }

-std::vector<int64_t> ZeroCopyTensor::shape() const { return {}; }
+std::vector<int> ZeroCopyTensor::shape() const { return {}; }

 void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
paddle/fluid/inference/api/helper.h  View file @ 2c4fcaa6
@@ -50,6 +50,11 @@ class Timer {
   }
 };

+static int GetUniqueId() {
+  static int id = 0;
+  return id++;
+}
+
 static void split(const std::string &str, char sep,
                   std::vector<std::string> *pieces) {
   pieces->clear();
paddle/fluid/inference/api/paddle_analysis_config.h  View file @ 2c4fcaa6
@@ -135,7 +135,8 @@ struct AnalysisConfig {
    */
   void EnableTensorRtEngine(int workspace_size = 1 << 20,
                             int max_batch_size = 1, int min_subgraph_size = 3,
-                            Precision precision = Precision::kFloat32);
+                            Precision precision = Precision::kFloat32,
+                            bool use_static = true);

   /** A boolean state telling whether the TensorRT engine is used.
    */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
@@ -233,6 +234,7 @@ struct AnalysisConfig {
   // subgraph, 3 as default value.
   int tensorrt_min_subgraph_size_{3};
   Precision tensorrt_precision_mode_;
+  bool trt_use_static_engine_;

   // memory reuse related.
   bool enable_memory_optim_{false};
paddle/fluid/inference/api/paddle_api.h  View file @ 2c4fcaa6
@@ -160,11 +160,21 @@ class ZeroCopyTensor {
   template <typename T>
   T* data(PaddlePlace* place, int* size) const;

-  std::vector<int64_t> shape() const;
+  template <typename T>
+  void copy_from_cpu(const T* data);
+
+  template <typename T>
+  void copy_to_cpu(T* data);
+
+  std::vector<int> shape() const;

   void SetLoD(const std::vector<std::vector<size_t>>& x);
   std::vector<std::vector<size_t>> lod() const;
   const std::string& name() const { return name_; }
+  void SetPlace(PaddlePlace place, int device = -1) {
+    place_ = place;
+    device_ = device;
+  }

  protected:
   explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
@@ -179,6 +189,8 @@ class ZeroCopyTensor {
   // The corresponding tensor pointer inside Paddle workspace is cached for
   // performance.
   mutable void* tensor_{nullptr};
+  PaddlePlace place_;
+  int device_;
 };

 /** A simple Inference API for Paddle.
@@ -200,6 +212,14 @@ class PaddlePredictor {
                    std::vector<PaddleTensor>* output_data,
                    int batch_size = -1) = 0;

+  /** \brief Get input names of the model
+   */
+  virtual std::vector<std::string> GetInputNames() { return {}; }
+
+  /** \brief Get output names of the model
+   */
+  virtual std::vector<std::string> GetOutputNames() { return {}; }
+
   /** \brief Get a mutable tensor directly.
    *
    * NOTE Only works in AnalysisPredictor.
paddle/fluid/inference/engine.h  View file @ 2c4fcaa6
@@ -49,11 +49,6 @@ class EngineBase {
   // Execute the engine, that will run the inference network.
   virtual void Execute(int batch_size) = 0;

-  // Return the IO buffer that allocated in engine. One can read/write directly
-  // on the buffer. If the buffer's buffer is nullptr, one can also allocate
-  // memory and maintain it outside the engine.
-  virtual Buffer& buffer(const std::string& name) = 0;
-
   virtual ~EngineBase() {}
 };  // class EngineBase
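With buffer() gone from the interface, callers now own the device buffers and hand them to Execute() in TensorRT binding order (see engine.cc and ut_helper.h further down). A sketch only, assuming engine is a frozen TensorRTEngine and the named bindings already have device allocations; the binding names are placeholders.

#include <vector>
#include <cuda_runtime.h>
#include "paddle/fluid/inference/tensorrt/engine.h"

// Sketch: caller-managed binding buffers for the new Execute() signature.
// "input" / "output" are placeholder binding names; in_gpu / out_gpu are
// device pointers already sized for batch_size.
void RunEngine(paddle::inference::tensorrt::TensorRTEngine *engine,
               void *in_gpu, void *out_gpu, int batch_size,
               cudaStream_t stream) {
  std::vector<void *> buffers(2);
  buffers[engine->engine()->getBindingIndex("input")] = in_gpu;
  buffers[engine->engine()->getBindingIndex("output")] = out_gpu;
  engine->Execute(batch_size, &buffers, stream);
  cudaStreamSynchronize(stream);
}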
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc  View file @ 2c4fcaa6
@@ -18,21 +18,6 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

-bool to_skip_merging_optimize(TensorRTEngine* engine,
-                              const std::vector<int>& filters,
-                              const std::vector<int>& strides,
-                              const std::vector<int>& paddings,
-                              std::string input_name) {
-  if (engine->itensor_quote_num[input_name] > 0) {
-    return true;
-  }
-  if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
-      strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
-    engine->itensor_quote_num[input_name] += 1;
-
-  return false;
-}
-
 template <typename RegistFunc, typename SetDilationFunc>
 void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
                    const framework::Scope& scope, bool test_mode,
@@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   weight_tensor->Resize(Y_t->dims());
   TensorCopySync((*Y_t), cpu_place, weight_tensor.get());

-  auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
+  auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);

   PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
   const int n_output = weight_tensor->dims()[0];
@@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   layer->getOutput(0)->setName(output_name.c_str());
   engine->SetITensor(output_name, layer->getOutput(0));

-  if (test_mode ||
-      to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
-                               op_desc.Input("Input").front())) {
+  if (test_mode) {
     engine->DeclareOutput(output_name);
   }
 }
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc  View file @ 2c4fcaa6
@@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter {
     if (CheckDims(dims_x, dims_y)) {
       // The two input tensor should have the same dims
       VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
       nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
           engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
           *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
@@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
                  "ElementWisePluginLayer";

       plugin::ElementWisePlugin* plugin =
-          new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
+          new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
       plugin->AddInput(X);
       plugin->AddInput(Y);
       nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
paddle/fluid/inference/tensorrt/convert/fc_op.cc  View file @ 2c4fcaa6
@@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter {
               Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));

   TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                 static_cast<void*>(weight_data),
-                                Y_t->memory_size() / sizeof(float)};
+                                static_cast<size_t>(Y_t->numel())};
   TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
                                     static_cast<void*>(tmp->data<float>()),
-                                    Y_t->memory_size() / sizeof(float));
+                                    static_cast<size_t>(Y_t->numel()));
   weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
   tmp_weight.dims = weight.dims;
paddle/fluid/inference/tensorrt/convert/op_converter.h  View file @ 2c4fcaa6
@@ -16,9 +16,12 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -26,6 +29,37 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+
+namespace {  // NOLINT
+
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unkown type");
+  return TRT_DT::kINT32;
+}
+
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT' tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT' tensor input requires at most 4 dimensions");
+  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
+  if (shape.size() == 4UL)
+    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  return nvinfer1::DimsCHW(shape[1], 1, 1);
+}
+
+}  // namespace // NOLINT
+
 /*
  * Convert Op from Fluid to TensorRT Engine.
  */
@@ -110,6 +144,34 @@ class OpConverter {
     }
   }

+  // The scope here should be inited with the parameter vars.
+  void ConvertBlockToTRTEngine(
+      framework::BlockDesc* block_desc, const framework::Scope& scope,
+      const std::vector<std::string>& inputs,
+      const std::unordered_set<std::string>& parameters,
+      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
+    engine->InitNetwork();
+    for (auto& input : inputs) {
+      if (parameters.count(input)) continue;
+      auto* var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                        "TensorRT engine only takes LoDTensor as input");
+      auto var_shape = var->GetShape();
+      engine->DeclareInput(
+          input, FluidDataType2TRT(
+                     var->Proto()->type().lod_tensor().tensor().data_type()),
+          Vec2TRT_Dims(var_shape));
+    }
+    framework::proto::BlockDesc* block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    for (auto& output : outputs) {
+      engine->DeclareOutput(output);
+    }
+    engine->FreezeNetwork();
+  }
+
   void SetEngine(TensorRTEngine* engine) { engine_ = engine; }

   virtual ~OpConverter() {}
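Vec2TRT_Dims drops the batch dimension: a 4-D Fluid shape [N, C, H, W] maps to DimsCHW(C, H, W), and a 2-D shape [N, K] maps to DimsCHW(K, 1, 1). A tiny illustration of the helper defined above; the values are arbitrary and the snippet only compiles inside a translation unit that includes op_converter.h (the helper lives in an anonymous namespace).

#include <vector>

// Illustration of the Vec2TRT_Dims mapping defined above; values are arbitrary.
void Vec2TRTDimsExample() {
  std::vector<int64_t> nchw = {8, 3, 224, 224};
  nvinfer1::Dims conv_in = Vec2TRT_Dims(nchw);  // -> DimsCHW(3, 224, 224)

  std::vector<int64_t> nk = {8, 1000};
  nvinfer1::Dims fc_in = Vec2TRT_Dims(nk);      // -> DimsCHW(1000, 1, 1)
  (void)conv_in;
  (void)fc_in;
}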
paddle/fluid/inference/tensorrt/convert/prelu_op.cc  View file @ 2c4fcaa6
@@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter {
     PADDLE_ENFORCE_NOT_NULL(alpha_var);
     auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();

-    platform::CUDAPlace place;
-    std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
-        new framework::LoDTensor());
-    alpha_tensor_device->Resize(alpha_tensor->dims());
-    TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
-    float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
+    platform::CPUPlace cpu_place;
+    std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
+        new framework::LoDTensor());
+    alpha_tensor_temp->Resize(alpha_tensor->dims());
+    TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
+    float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);

-    // Transform alpha to TensorRTEngine::Weight
-    TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
-                                    static_cast<void*>(alpha_data),
-                                    alpha_tensor_device->numel());
-    plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
+    plugin::PReluPlugin* plugin =
+        new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
     nvinfer1::IPluginLayer* layer =
         engine_->AddPlugin(&input, input_num, plugin);
     // keep alpha tensor to avoid release it's memory
     engine_->weight_map[op_desc.Input("Alpha")[0]] =
-        std::move(alpha_tensor_device);
+        std::move(alpha_tensor_temp);

     std::string layer_name = "prelu (Output: ";
     auto output_name = op_desc.Output("Out")[0];
paddle/fluid/inference/tensorrt/convert/ut_helper.h  View file @ 2c4fcaa6
@@ -19,7 +19,9 @@ limitations under the License. */
 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>

 #include "paddle/fluid/framework/lod_tensor.h"
@@ -79,7 +81,8 @@ class TRTConvertValidation {
         if_add_batch_(if_add_batch),
         max_batch_size_(max_batch_size) {
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
+    engine_.reset(
+        new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
     engine_->InitNetwork();
   }
@@ -114,13 +117,12 @@ class TRTConvertValidation {
   }

   void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
+    platform::CUDAPlace place;
-    platform::CUDADeviceContext ctx(place_);
+    platform::CUDADeviceContext ctx(place);

     auto* x = scope_.Var(name);
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
     x_tensor->Resize(framework::make_ddim(dim_vec));
-    RandomizeTensor(x_tensor, place_, ctx);
+    RandomizeTensor(x_tensor, place, ctx);
   }
   // Declare a variable in a fluid Scope.
   void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
@@ -146,19 +148,6 @@ class TRTConvertValidation {
     // Declare outputs.
     op_desc_.reset(new framework::OpDesc(desc, nullptr));
-
-    // Set Inputs.
-    for (const auto& input : op_desc_->InputArgumentNames()) {
-      if (parameters_.count(input)) continue;
-      auto* var = scope_.FindVar(input);
-      PADDLE_ENFORCE(var);
-      auto tensor = var->GetMutable<framework::LoDTensor>();
-
-      engine_->SetInputFromGPU(
-          input, static_cast<void*>(tensor->data<void>()),
-          sizeof(float) *
-              analysis::AccuDims(tensor->dims(), tensor->dims().size()));
-    }
   }

   // We use the set 'neglected_output' here, because some Ops like batch norm,
@@ -168,43 +157,71 @@ class TRTConvertValidation {
                std::unordered_set<std::string> neglected_output = {}) {
     // Execute Fluid Op
     PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
+    platform::CUDAPlace place;
-    platform::CUDADeviceContext ctx(place_);
+    platform::CUDADeviceContext ctx(place);
-    op_->Run(scope_, place_);
+    op_->Run(scope_, place);
-    // Execute TRT.
-    engine_->Execute(batch_size);
-    cudaStreamSynchronize(engine_->stream());

-    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    const size_t output_space_size = 3000;
+    std::vector<std::string> input_output_names;
+    // Note: we need filter the parameter
+    for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
+      input_output_names.push_back(input);
+    }
+
+    // Collect the fluid outputs.
+    std::vector<std::vector<float>> fluid_outs;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       if (neglected_output.count(output)) continue;
+      input_output_names.push_back(output);
       std::vector<float> fluid_out;
-      std::vector<float> trt_out(output_space_size);
-      engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
-      cudaStreamSynchronize(engine_->stream());
       auto* var = scope_.FindVar(output);
-      auto tensor = var->GetMutable<framework::LoDTensor>();
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
       framework::TensorToVector(*tensor, ctx, &fluid_out);
+      fluid_outs.push_back(fluid_out);
+    }
+
+    // Bind input and output for TRT.
+    const int num_bindings = input_output_names.size();
+    std::vector<void*> buffers(num_bindings);
+
+    for (const std::string& name : input_output_names) {
+      auto* var = scope_.FindVar(name);
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
+      const int bind_index = engine_->engine()->getBindingIndex(name.c_str());
+      buffers[bind_index] =
+          static_cast<void*>(tensor->mutable_data<float>(place_));
+    }
+
+    // Execute TRT.
+    engine_->Execute(batch_size, &buffers, stream_);

-      size_t fluid_out_size = fluid_out.size();
+    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
+    int index = 0;
+    for (const auto& output : op_desc_->OutputArgumentNames()) {
+      if (neglected_output.count(output)) continue;
+      std::vector<float> trt_out;
+      auto* var = scope_.FindVar(output);
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
+      framework::TensorToVector(*tensor, ctx, &trt_out);
+
+      size_t fluid_out_size = fluid_outs[index].size();
       if (if_add_batch_ == true) {
         fluid_out_size =
             batch_size * (framework::product(tensor->dims()) / max_batch_size_);
       }
-      // Compare two output
-      ASSERT_FALSE(fluid_out.empty());
       for (size_t i = 0; i < fluid_out_size; i++) {
         // Loose the threshold for CI in different machine model.
-        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
+        EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5);
       }
+      index += 1;
     }
   }

   framework::Scope& scope() { return scope_; }

  private:
   platform::CUDAPlace place_;
   std::unique_ptr<TensorRTEngine> engine_;
   cudaStream_t stream_;
   std::unique_ptr<framework::OperatorBase> op_;
paddle/fluid/inference/tensorrt/engine.cc
浏览文件 @
2c4fcaa6
...
@@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) {
   PADDLE_ENFORCE(false, "not implemented");
 }

-void TensorRTEngine::Execute(int batch_size) {
+void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
+                             cudaStream_t stream) {
   freshDeviceId();
   batch_size_ = batch_size;
-  std::vector<void *> buffers;
-  for (auto &buf : buffers_) {
-    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
-    PADDLE_ENFORCE_GT(buf.max_size, 0);
-    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-    buffers.push_back(buf.buffer);
-  }
-  infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
-  cudaStreamSynchronize(stream_);
+  infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr);
+  cudaStreamSynchronize(stream);
   SetRuntimeBatch(batch_size);
 }

-TensorRTEngine::~TensorRTEngine() {
-  cudaStreamSynchronize(stream_);
-  // clean buffer
-  for (auto &buf : buffers_) {
-    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
-      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
-      buf.buffer = nullptr;
-      buf.max_size = 0;
-    }
-  }
-}
-
 void TensorRTEngine::FreezeNetwork() {
-  VLOG(3) << "TRT to freeze network";
   freshDeviceId();
+  VLOG(3) << "TRT to freeze network";
   PADDLE_ENFORCE(infer_builder_ != nullptr,
                  "Call InitNetwork first to initialize network.");
   PADDLE_ENFORCE(infer_network_ != nullptr,
...
@@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() {
   PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");

   infer_context_.reset(infer_engine_->createExecutionContext());
-
-  // allocate GPU buffers.
-  buffers_.resize(buffer_sizes_.size());
-  for (auto &item : buffer_sizes_) {
-    // The output buffers are not set in the network building phrase, need to
-    // infer from the TesorRT network.
-    if (item.second == 0) {
-      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
-      auto dims = infer_engine_->getBindingDimensions(slot_offset);
-      item.second = kDataTypeSize[static_cast<int>(
-                        infer_engine_->getBindingDataType(slot_offset))] *
-                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
-      PADDLE_ENFORCE_GT(item.second, 0);
-    }
-    auto &buf = buffer(item.first);
-    buf.max_size = item.second * max_batch_;
-    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
-    buf.size = 0;
-    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
-    buf.device = DeviceType::GPU;
-  }
 }

 nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
...
@@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
   buffer_sizes_[name] = 0;
 }

-void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
-  return buffer(name).buffer;
-}
-
-void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
-                                    size_t max_size) {
-  // determine data size
-  auto *output = TensorRTEngine::GetITensor(name);
-  nvinfer1::Dims dims = output->getDimensions();
-  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
-  size_t dst_size = dim_size * runtime_batch_ *
-                    kDataTypeSize[static_cast<int>(output->getType())];
-
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end());
-  PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_LE(dst_size, it->second);
-  PADDLE_ENFORCE_GE(max_size, dst_size);
-  auto &buf = buffer(name);
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                    cudaMemcpyDeviceToDevice, stream_),
-                    0);
-}
-
-void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
-                                    size_t max_size) {
-  // determine data size
-  auto *output = TensorRTEngine::GetITensor(name);
-  nvinfer1::Dims dims = output->getDimensions();
-  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
-  size_t dst_size = dim_size * runtime_batch_ *
-                    kDataTypeSize[static_cast<int>(output->getType())];
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end());
-  PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_LE(dst_size, it->second);
-  PADDLE_ENFORCE_GE(max_size, dst_size);
-  auto &buf = buffer(name);
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                       cudaMemcpyDeviceToHost, stream_));
-}
-
-Buffer &TensorRTEngine::buffer(const std::string &name) {
-  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
-                 name);
-  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
-  return buffers_[slot_offset];
-}
-
-void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
-                                     size_t size) {
-  auto &buf = buffer(name);
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
-  PADDLE_ENFORCE_NOT_NULL(data);
-  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
-  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-  buf.size = size;
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyHostToDevice, stream_));
-}
-
-void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
-                                     size_t size) {
-  auto &buf = buffer(name);
-  buf.size = size;
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
-  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
-  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyDeviceToDevice, stream_));
-}
-
 void TensorRTEngine::SetITensor(const std::string &name,
                                 nvinfer1::ITensor *tensor) {
   PADDLE_ENFORCE(tensor != nullptr);
...
@@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
 int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }

-void TensorRTEngine::freshDeviceId() {
-  int count;
-  cudaGetDeviceCount(&count);
-  PADDLE_ENFORCE_LT(device_, count);
-  cudaSetDevice(device_);
-}
-
 nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
     nvinfer1::ITensor *const *inputs, int num_inputs,
     plugin::PluginTensorRT *plugin) {
...
@@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
   return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
 }

+void TensorRTEngine::freshDeviceId() {
+  int count;
+  cudaGetDeviceCount(&count);
+  PADDLE_ENFORCE_LT(device_id_, count);
+  cudaSetDevice(device_id_);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
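With this change the engine no longer owns GPU buffers or a CUDA stream; the caller passes both to Execute(). A minimal sketch of the new calling convention, assuming the network has already been frozen and that binding 0 is the input and binding 1 the output (the helper name RunOnce and the pointer names are illustrative, not part of the commit):

#include <vector>
#include <cuda_runtime.h>
#include "paddle/fluid/inference/tensorrt/engine.h"

using paddle::inference::tensorrt::TensorRTEngine;

// Hypothetical helper: run one batch on an already-frozen engine.
// input_gpu / output_gpu are caller-allocated device pointers that match
// the engine's binding order.
void RunOnce(TensorRTEngine *engine, void *input_gpu, void *output_gpu,
             int batch_size, cudaStream_t stream) {
  std::vector<void *> buffers(2);
  buffers[0] = input_gpu;
  buffers[1] = output_gpu;
  // Execute() enqueues on `stream` and synchronizes it before returning,
  // per the new implementation above.
  engine->Execute(batch_size, &buffers, stream);
}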
paddle/fluid/inference/tensorrt/engine.h
...
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
 #include "paddle/fluid/inference/utils/singleton.h"
...
@@ -37,7 +38,9 @@ class TRTInt8Calibrator;
 * There are two alternative ways to use it, one is to build from a paddle
 * protobuf model, another way is to manully construct the network.
 */
-class TensorRTEngine : public EngineBase {
+class TensorRTEngine {
+  using DescType = ::paddle::framework::proto::BlockDesc;
+
  public:
   // Weight is model parameter.
   class Weight {
...
@@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase {
     nvinfer1::Weights w_;
   };

-  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
-                 bool enable_int8 = false, int device = 0,
-                 TRTInt8Calibrator* calibrator = nullptr,
+  TensorRTEngine(int max_batch, int max_workspace,
+                 bool enable_int8 = false,
+                 TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
-        stream_(stream),
-        device_(device),
         enable_int8_(enable_int8),
         calibrator_(calibrator),
+        device_id_(device_id),
         logger_(logger) {}

-  virtual ~TensorRTEngine();
+  ~TensorRTEngine() {}

   // TODO(Superjomn) implement it later when graph segmentation is supported.
-  void Build(const DescType& paddle_model) override;
+  void Build(const DescType& paddle_model);

-  void Execute(int batch_size) override;
+  void Execute(int batch_size, std::vector<void*>* buffers,
+               cudaStream_t stream);

   // Initialize the inference network, so that TensorRT layers can add to this
   // network.
   void InitNetwork() {
+    freshDeviceId();
     infer_builder_.reset(createInferBuilder(&logger_));
     infer_network_.reset(infer_builder_->createNetwork());
   }
...
@@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase {
   // Check if the ITensor has been declared
   bool HasDeclared(const std::string& name);

-  // GPU memory address for an ITensor with specific name. One can operate on
-  // these memory directly for acceleration, for example, output the converted
-  // data directly to the buffer to save data copy overhead.
-  // NOTE this should be used after calling `FreezeNetwork`.
-  Buffer& buffer(const std::string& name) override;
-
-  cudaStream_t stream() { return stream_; }
-
-  // Fill an input from CPU memory with name and size.
-  void SetInputFromCPU(const std::string& name, const void* data, size_t size);
-  // TODO(Superjomn) is this method necessary given that buffer(xxx) can be
-  // accessed directly. Fill an input from GPU memory with name and size.
-  void SetInputFromGPU(const std::string& name, const void* data, size_t size);
-  // Get an output called name, the output of tensorrt is in GPU, so this method
-  // Return the output's GPU memory address without copy.
-  void* GetOutputInGPU(const std::string& name);
-  // Copy data into dst inside the GPU device.
-  void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
-  // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
-  // to CPU.
-  void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
   // Fill an ITensor into map itensor_map_.
   void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
   // Get an ITensor called name.
   nvinfer1::ITensor* GetITensor(const std::string& name);

   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
+
+  nvinfer1::IHostMemory* Serialize() {
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "You should build engine first and then serialize");
+    ihost_memory_.reset(infer_engine_->serialize());
+    return ihost_memory_.get();
+  }
+
+  void Deserialize(const std::string& engine_serialized_data) {
+    freshDeviceId();
+    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
+    infer_engine_.reset(runtime->deserializeCudaEngine(
+        engine_serialized_data.c_str(), engine_serialized_data.size(),
+        &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "build cuda engine failed when deserialize engine info.!");
+    infer_context_.reset(infer_engine_->createExecutionContext());
+  }
+
   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
-  int GetDevice() { return device_; }
+  int GetDeviceId() { return device_id_; }
   nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                     int num_inputs, plugin::PluginTensorRT*);
...
@@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase {
   std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
       weight_map;

-  // TODO(NHZLX)
-  // In the normal case, the paddle-trt exists bug when runing the googlenet.
-  // When there are more than two convolutions of 1 * 1 with the same input, the
-  // paddle-tensorrt will do the merging optimization, which fuse those conv
-  // into one conv, and then trigger bug. So, We should use strategy to avoid
-  // this
-  // optimization for the time being. This bug will be fixed in the future.
-  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
-      itensor_quote_num;
-
  private:
+  // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
+  // ensure that the thread is associated with the correct device by calling
+  // freshDeviceId().
+  void freshDeviceId();
+
   // the max batch size
   int max_batch_;
   // the runtime batch size
...
@@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase {
   // the max memory size the engine uses
   int max_workspace_;

-  cudaStream_t stream_;
-  // The specific GPU id that the TensorRTEngine bounded to.
-  int device_;
-
   bool enable_int8_;
   TRTInt8Calibrator* calibrator_;
   // batch size of the current data, will be updated each Executation.
   int batch_size_{-1};
+
+  int device_id_;
   nvinfer1::ILogger& logger_;

-  std::vector<Buffer> buffers_;
   // max data size for the buffers.
   std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
   std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
...
@@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase {
   infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
   infer_ptr<nvinfer1::IExecutionContext> infer_context_;
-  // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
-  // ensure that the thread is associated with the correct device by calling
-  // freshDeviceId().
-  void freshDeviceId();
+  infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
 };  // class TensorRTEngine

 // Add an layer__ into engine__ with args ARGS.
 // For example:
 //   TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
 //
 // Reference
 // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
...
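The new Serialize()/Deserialize() pair is what makes the engine cacheable across processes. A minimal round-trip sketch under the assumption that FreezeNetwork() has already been called on the source engine and that both engines were created with compatible parameters (the helper names SaveEngine/LoadEngine are illustrative only):

#include <string>
#include "paddle/fluid/inference/tensorrt/engine.h"

using paddle::inference::tensorrt::TensorRTEngine;

// Sketch: persist a frozen engine as a byte string.
std::string SaveEngine(TensorRTEngine *engine) {
  nvinfer1::IHostMemory *blob = engine->Serialize();  // requires FreezeNetwork() first
  return std::string(static_cast<const char *>(blob->data()), blob->size());
}

// Sketch: rebuild infer_engine_/infer_context_ from the serialized bytes.
// Plugin layers are recreated through PluginFactoryTensorRT, as Deserialize()
// passes the singleton factory to deserializeCudaEngine().
void LoadEngine(TensorRTEngine *engine, const std::string &blob) {
  engine->Deserialize(blob);
}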
paddle/fluid/inference/tensorrt/helper.h
...
@@ -17,6 +17,9 @@
 #include <NvInfer.h>
 #include <cuda.h>
 #include <glog/logging.h>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/platform/dynload/tensorrt.h"
 #include "paddle/fluid/platform/enforce.h"
...
@@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger {
   ~NaiveLogger() override {}
 };

+class NaiveProfiler : public nvinfer1::IProfiler {
+ public:
+  typedef std::pair<std::string, float> Record;
+  std::vector<Record> mProfile;
+
+  virtual void reportLayerTime(const char* layerName, float ms) {
+    auto record =
+        std::find_if(mProfile.begin(), mProfile.end(),
+                     [&](const Record& r) { return r.first == layerName; });
+    if (record == mProfile.end())
+      mProfile.push_back(std::make_pair(layerName, ms));
+    else
+      record->second += ms;
+  }
+
+  void printLayerTimes() {
+    float totalTime = 0;
+    for (size_t i = 0; i < mProfile.size(); i++) {
+      printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
+             mProfile[i].second);
+      totalTime += mProfile[i].second;
+    }
+    printf("Time over all layers: %4.3f\n", totalTime);
+  }
+};
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
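NaiveProfiler accumulates per-layer execution time. The commit only defines the class; attaching it to a context is standard TensorRT usage and is sketched below as an assumption (note that TensorRT reports layer times through IProfiler only on the synchronous execute() path, not on enqueue()):

// Sketch: attach the profiler to an execution context and print layer times.
paddle::inference::tensorrt::NaiveProfiler profiler;
nvinfer1::IExecutionContext *context = engine->createExecutionContext();
context->setProfiler(&profiler);

// ... run inference with context->execute(batch_size, bindings) here;
// TensorRT calls reportLayerTime() once per layer per run ...

profiler.printLayerTimes();  // prints per-layer milliseconds and the total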
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
 nv_library(tensorrt_plugin
-  SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
+  SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
+  prelu_op_plugin.cu trt_plugin_factory.cc
   avg_pool_op_plugin.cu
   DEPS enforce tensorrt_engine prelu)
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
...
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 #include "paddle/fluid/operators/math/pooling.h"

 namespace paddle {
...
@@ -20,6 +21,12 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

+AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer,
+                                              size_t length) {
+  return new AvgPoolPlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize);
+
 nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
     int index, const nvinfer1::Dims* inputDims, int nbInputs) {
   assert(nbInputs == 1);
...
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
...
@@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT {
  protected:
   size_t getSerializationSize() override {
-    return SerializedSize(ceil_mode_) + SerializedSize(ksize_) +
-           SerializedSize(strides_) + SerializedSize(paddings_) +
-           SerializedSize(input_shape_) + getBaseSerializationSize();
+    return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) +
+           SerializedSize(ksize_) + SerializedSize(strides_) +
+           SerializedSize(paddings_) + SerializedSize(input_shape_) +
+           SerializedSize(output_shape_) + getBaseSerializationSize();
   }

   // TRT will call this func when we need to serialize the configuration of
   // tensorrt.
   // It should not be called by users.
   void serialize(void *buffer) override {
+    SerializeValue(&buffer, getPluginType());
     serializeBase(buffer);
     SerializeValue(&buffer, ceil_mode_);
     SerializeValue(&buffer, ksize_);
     SerializeValue(&buffer, strides_);
     SerializeValue(&buffer, paddings_);
     SerializeValue(&buffer, input_shape_);
+    SerializeValue(&buffer, output_shape_);
   }

  public:
+  AvgPoolPlugin() {}
   AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
                 std::vector<int> strides, std::vector<int> paddings,
                 std::vector<int> input_shape)
...
@@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT {
     DeserializeValue(&serialData, &serialLength, &strides_);
     DeserializeValue(&serialData, &serialLength, &paddings_);
     DeserializeValue(&serialData, &serialLength, &input_shape_);
+    DeserializeValue(&serialData, &serialLength, &output_shape_);
   }

   AvgPoolPlugin* clone() const override {
...
@@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT {
                              input_shape_);
   }

-  const char* getPluginType() const override { return "avg_pool"; }
+  const char* getPluginType() const override { return "avg_pool_plugin"; }
   int getNbOutputs() const override { return 1; }
   nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                      int nbInputDims) override;
...
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
...
@@ -14,12 +14,19 @@ limitations under the License. */

 #include <glog/logging.h>
 #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {
 namespace plugin {

+ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer,
+                                                      size_t length) {
+  return new ElementWisePlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize);
+
 namespace details {

 template <typename T>
...
@@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
   const float* y = reinterpret_cast<const float*>(inputs[1]);
   float* out = reinterpret_cast<float*>(outputs[0]);

-  if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
+  if (type_ == "add") {
     details::ElementWise(details::Add<float>(), x, y, out, batch_size,
                          prev_size_, midd_size_, post_size_, stream);
-  } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
+  } else if (type_ == "mul") {
     details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
                          prev_size_, midd_size_, post_size_, stream);
   } else {
...
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
...
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once
+#include <string>
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
@@ -24,9 +25,8 @@ namespace plugin {

 class ElementWisePlugin : public PluginTensorRT {
  public:
-  ElementWisePlugin(nvinfer1::ElementWiseOperation type,
+  ElementWisePlugin(std::string type,
                     nvinfer1::Dims const& dims_x,
                     nvinfer1::Dims const& dims_y,
                     int axis)
       : type_(type),
         dims_x_(dims_x),
         dims_y_(dims_y),
...
@@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT {

   ElementWisePlugin(void const* serial_data, size_t serial_length) {
     deserializeBase(serial_data, serial_length);
+    const char* elementwise_type;
+    DeserializeValue(&serial_data, &serial_length, &elementwise_type);
+    type_ = std::string(elementwise_type);
     DeserializeValue(&serial_data, &serial_length, &axis_);
     DeserializeValue(&serial_data, &serial_length, &dims_x_);
     DeserializeValue(&serial_data, &serial_length, &dims_y_);
...
@@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT {
     return nullptr;
   }

-  const char* getPluginType() const override { return "elementwise"; }
+  const char* getPluginType() const override { return "elementwise_plugin"; }

   nvinfer1::Dims getOutputDimensions(int index,
                                      const nvinfer1::Dims* input_dims,
...
@@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT {
  protected:
   size_t getSerializationSize() override {
-    return SerializedSize(axis_) + SerializedSize(dims_x_) +
-           SerializedSize(dims_y_) + getBaseSerializationSize();
+    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
+           SerializedSize(dims_x_) + SerializedSize(dims_y_) +
+           getBaseSerializationSize();
   }

   void serialize(void* buffer) override {
+    SerializeValue(&buffer, getPluginType());
     serializeBase(buffer);
+    SerializeValue(&buffer, type_.c_str());
     SerializeValue(&buffer, axis_);
     SerializeValue(&buffer, dims_x_);
     SerializeValue(&buffer, dims_y_);
   }

-  nvinfer1::ElementWiseOperation type_;
+  std::string type_;
   nvinfer1::Dims dims_x_;
   nvinfer1::Dims dims_y_;
   int axis_;
...
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
...
@@ -17,6 +17,7 @@
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 #include "paddle/fluid/operators/math/prelu.h"

 namespace paddle {
...
@@ -24,6 +25,17 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

+PReluPlugin* CreatePreluPluginDeserialize(const void* buffer, size_t length) {
+  return new PReluPlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize);
+
+int PReluPlugin::initialize() {
+  cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
+  cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
+             cudaMemcpyHostToDevice);
+}
+
 nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
                                                 const nvinfer1::Dims* inputDims,
                                                 int nbInputs) {
...
@@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
   // input dims is CHW.
   const auto& input_dims = this->getInputDims(0);
   const float* input = reinterpret_cast<const float*>(inputs[0]);
-  const float* alpha = reinterpret_cast<const float*>(alpha_.get().values);
+  // const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
+  const float* alpha = p_gpu_weight_;
   float* output = reinterpret_cast<float**>(outputs)[0];

   std::vector<int> input_shape;
...
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
...
@@ -14,7 +14,12 @@

 #pragma once

+#include <algorithm>
 #include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
@@ -24,39 +29,51 @@ namespace tensorrt {
 namespace plugin {

 class PReluPlugin : public PluginTensorRT {
-  TensorRTEngine::Weight alpha_;
+  std::vector<float> weight_;
+  float* p_gpu_weight_;
   std::string mode_;

  protected:
   size_t getSerializationSize() override {
-    // return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
-    return 0;
+    return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
+           SerializedSize(weight_) + SerializedSize(getPluginType());
   }

   // TRT will call this func when we need to serialize the configuration of
   // tensorrt.
   // It should not be called by users.
   void serialize(void* buffer) override {
-    // serializeBase(buffer);
-    // SerializeValue(&buffer, alpha_);
-    // SerializeValue(&buffer, mode_);
+    SerializeValue(&buffer, getPluginType());
+    serializeBase(buffer);
+    SerializeValue(&buffer, weight_);
+    SerializeValue(&buffer, mode_.c_str());
   }

  public:
-  PReluPlugin(TensorRTEngine::Weight const& alpha, std::string const& mode)
-      : alpha_(alpha), mode_(mode) {}
+  PReluPlugin(const float* weight, const int weight_num,
+              std::string const& mode)
+      : mode_(mode) {
+    weight_.resize(weight_num);
+    std::copy(weight, weight + weight_num, weight_.data());
+  }

   // It was used for tensorrt deserialization.
   // It should not be called by users.
   PReluPlugin(void const* serialData, size_t serialLength) {
-    // deserializeBase(serialData, serialLength);
-    // DeserializeValue(&serialData, &serialLength, &alpha_);
-    // DeserializeValue(&serialData, &serialLength, &mode_);
+    deserializeBase(serialData, serialLength);
+    DeserializeValue(&serialData, &serialLength, &weight_);
+    const char* prelu_mode;
+    DeserializeValue(&serialData, &serialLength, &prelu_mode);
+    mode_ = std::string(prelu_mode);
   }

+  ~PReluPlugin() { cudaFree(p_gpu_weight_); }
+  int initialize() override;
+
-  PReluPlugin* clone() const override { return new PReluPlugin(alpha_, mode_); }
+  PReluPlugin* clone() const override {
+    return new PReluPlugin(weight_.data(), weight_.size(), mode_);
+  }

-  const char* getPluginType() const override { return "prelu"; }
+  const char* getPluginType() const override { return "prelu_plugin"; }
   int getNbOutputs() const override { return 1; }
   nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                      int nbInputDims) override;
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
...
@@ -15,12 +15,18 @@
 #include <cuda_fp16.h>
 #include <algorithm>
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {
 namespace plugin {

+SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
+  return new SplitPlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);
+
 // copied from operators::math::SplitFunctor
 template <typename T>
 __global__ void SplitKernel(const T* input_data, const int in_row,
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
...
@@ -15,6 +15,7 @@

 #pragma once
 #include <thrust/device_vector.h>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
@@ -25,6 +26,7 @@ namespace plugin {

 class SplitPlugin : public PluginTensorRT {
  public:
+  SplitPlugin() {}
   SplitPlugin(int axis, std::vector<int> const& output_lengths)
       : axis_(axis), same_shape_(true), output_length_(output_lengths) {}
...
@@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT {
     return new SplitPlugin(axis_, output_length_);
   }

-  const char* getPluginType() const override { return "split"; }
+  const char* getPluginType() const override { return "split_plugin"; }
   int getNbOutputs() const override { return output_length_.size(); }
   nvinfer1::Dims getOutputDimensions(int index,
                                      const nvinfer1::Dims* input_dims,
...
@@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT {
  protected:
   size_t getSerializationSize() override {
-    return SerializedSize(axis_) + SerializedSize(output_length_) +
-           getBaseSerializationSize();
+    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
+           SerializedSize(output_length_) + getBaseSerializationSize();
   }

   void serialize(void* buffer) override {
+    SerializeValue(&buffer, getPluginType());
     serializeBase(buffer);
     SerializeValue(&buffer, axis_);
     SerializeValue(&buffer, output_length_);
...
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
...
@@ -17,9 +17,10 @@
 #include <NvInfer.h>
 #include <cstring>
 #include <unordered_map>
+#include <utility>
 #include <vector>

-#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
...
@@ -30,6 +31,13 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

+class PluginTensorRT;
+
+typedef std::function<PluginTensorRT*(const void*, size_t)>
+    PluginDeserializeFunc;
+
+typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
+
 class PluginTensorRT : public nvinfer1::IPluginExt {
  public:
   PluginTensorRT() {}
...
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
                                                    const void* serial_data,
                                                    size_t serial_length) {
  const char* plugin_type;
  DeserializeValue(&serial_data, &serial_length, &plugin_type);

  PADDLE_ENFORCE(Has(plugin_type),
                 "trt plugin type %s does not exists, check it.", plugin_type);
  auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
  owned_plugins_.emplace_back(plugin);

  return plugin;
}

bool PluginFactoryTensorRT::RegisterPlugin(
    const std::string& op_name, PluginDeserializeFunc deserialize_func) {
  if (Has(op_name)) return false;
  auto ret = plugin_registry_.emplace(op_name, deserialize_func);
  return ret.second;
}

void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <NvInfer.h>
#include <cstring>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

class PluginFactoryTensorRT : public nvinfer1::IPluginFactory,
                              public DeleteHelper {
 public:
  // Deserialization method
  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
                               size_t serial_length) override;

  bool RegisterPlugin(const std::string& op_name,
                      PluginDeserializeFunc deserialize_func);

  bool Has(const std::string& op_name) {
    return plugin_registry_.find(op_name) != plugin_registry_.end();
  }

  void DestroyPlugins();

 protected:
  std::unordered_map<std::string, PluginDeserializeFunc> plugin_registry_;

  std::list<std::unique_ptr<PluginTensorRT>> owned_plugins_;
};

class TrtPluginRegistrar {
 public:
  TrtPluginRegistrar(const std::string& name,
                     PluginDeserializeFunc deserialize_func) {
    inference::Singleton<PluginFactoryTensorRT>::Global().RegisterPlugin(
        name, deserialize_func);
  }
};

#define REGISTER_TRT_PLUGIN(name, deserialize_func) \
  REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func)

#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func)      \
  static paddle::inference::tensorrt::plugin::TrtPluginRegistrar   \
      trt_plugin_registrar##ctr __attribute__((unused)) =          \
          paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
              name, deserialize_func)

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
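The factory maps a plugin-type string to a deserialization function; REGISTER_TRT_PLUGIN expands to a static TrtPluginRegistrar that puts the pair into the singleton registry at program startup, which is exactly how the avg_pool, elementwise, prelu and split plugins register themselves in this commit. A sketch of the same pattern for a hypothetical plugin (MyPlugin and "my_plugin" are illustrative names, not part of the commit):

// Sketch: how a plugin makes itself deserializable by the factory.
MyPlugin* CreateMyPluginDeserialize(const void* buffer, size_t length) {
  // The constructor consumes the fields written by MyPlugin::serialize().
  return new MyPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("my_plugin", CreateMyPluginDeserialize);

At load time, TensorRTEngine::Deserialize() hands the singleton PluginFactoryTensorRT to deserializeCudaEngine(); createPlugin() then reads the leading type string from the serialized blob and dispatches to the registered function.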
paddle/fluid/inference/tensorrt/plugin/serialize.h → paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
...
@@ -13,8 +13,8 @@
 // limitations under the License.

 #pragma once

 #include <cstring>
+#include <string>
 #include <type_traits>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
...
@@ -24,6 +24,13 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

+// Some trt base classes lack of the destructor.
+// We use a assisted class to fix this.
+struct DeleteHelper {
+ protected:
+  virtual ~DeleteHelper() {}
+};
+
 template <typename T>
 inline void SerializeValue(void** buffer, T const& value);
...
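The renamed header keeps SerializeValue/DeserializeValue; what changes across the plugin files is the on-wire convention, which now writes the plugin-type string first so that PluginFactoryTensorRT::createPlugin() can dispatch on it. A condensed sketch of that ordering (the axis_ field stands in for any plugin-specific member):

// Sketch: layout each plugin's serialize() now produces.
void serialize(void* buffer) /* override */ {
  SerializeValue(&buffer, getPluginType());  // 1. type tag, consumed by the factory
  serializeBase(buffer);                     // 2. PluginTensorRT base fields
  SerializeValue(&buffer, axis_);            // 3. plugin-specific fields, in order
}
// The deserialization constructor reads the same fields back with
// DeserializeValue(&serial_data, &serial_length, &axis_); and so on,
// after the factory has already stripped the type tag.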
paddle/fluid/inference/tensorrt/test_engine.cc
...
@@ -17,6 +17,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>

+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/enforce.h"
...
@@ -27,19 +29,34 @@ namespace tensorrt {
 class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    ASSERT_EQ(0, cudaStreamCreate(&stream_));
-    engine_ = new TensorRTEngine(10, 1 << 10, stream_);
+    ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
+
+    engine_ = new TensorRTEngine(10, 1 << 10);
     engine_->InitNetwork();
   }

   void TearDown() override {
-    delete engine_;
-    cudaStreamDestroy(stream_);
+    if (engine_) {
+      delete engine_;
+      engine_ = nullptr;
+    }
+  }
+
+  void PrepareInputOutput(const std::vector<float> &input,
+                          std::vector<int> output_shape) {
+    TensorFromVector(input, *ctx_, &input_);
+    output_.Resize(framework::make_ddim(output_shape));
+  }
+
+  void GetOutput(std::vector<float> *output) {
+    TensorToVector(output_, *ctx_, output);
   }

  protected:
-  TensorRTEngine *engine_;
-  cudaStream_t stream_;
+  framework::Tensor input_;
+  framework::Tensor output_;
+  TensorRTEngine *engine_;
+  platform::CUDADeviceContext *ctx_;
 };

 TEST_F(TensorRTEngineTest, add_layer) {
...
@@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) {
   float raw_weight[size] = {2.};  // Weight in CPU memory.
   float raw_bias[size] = {3.};

+  std::vector<void *> buffers(2);  // TRT binded inputs
+
   LOG(INFO) << "create weights";
   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size);
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::DimsCHW{1, 1, 1});
   auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
                                         weight.get(), bias.get());
   PADDLE_ENFORCE(fc_layer != nullptr);
...
@@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) {
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);

   // fill in real data
-  float x_v = 1234;
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           1 * sizeof(float));
+  std::vector<float> x_v = {1234};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {1});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
   LOG(INFO) << "to execute";
-  engine_->Execute(1);
+  engine_->Execute(1, &buffers, ctx_->stream());

   LOG(INFO) << "to get output";
-  float y_cpu;
-  engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
+  GetOutput(&y_cpu);

   LOG(INFO) << "to checkout output";
-  ASSERT_EQ(y_cpu, x_v * 2 + 3);
+  ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
 }

 TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
...
@@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
   float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
   float raw_bias[2] = {1.3, 2.4};
+  std::vector<void *> buffers(2);  // TRT binded inputs

   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::DimsCHW{1, 2, 1});
   auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
                                         weight.get(), bias.get());
   PADDLE_ENFORCE(fc_layer != nullptr);
...
@@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);

   // fill in real data
-  float x_v[2] = {1.0, 2.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           2 * sizeof(float));
-  engine_->Execute(1);
+  std::vector<float> x_v = {1.0, 2.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {2});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(1, &buffers, ctx_->stream());

   LOG(INFO) << "to get output";
-  float y_cpu[2] = {-1., -1.};
+  GetOutput(&y_cpu);

   auto dims = engine_->GetITensor("y")->getDimensions();
   ASSERT_EQ(dims.nbDims, 3);
   ASSERT_EQ(dims.d[0], 2);
   ASSERT_EQ(dims.d[1], 1);
-  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
   ASSERT_EQ(y_cpu[0], 4.5);
   ASSERT_EQ(y_cpu[1], 14.5);
 }
...
@@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
   // Weight in CPU memory.
   float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
   float raw_bias[1] = {0};
+  std::vector<void *> buffers(2);  // TRT binded inputs

   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::Dims3{1, 3, 3});
   auto *conv_layer =
       TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
                            weight.get(), bias.get());
   PADDLE_ENFORCE(conv_layer != nullptr);
...
@@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);

   // fill in real data
-  float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-                   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           18 * sizeof(float));
-  engine_->Execute(2);
+  std::vector<float> x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {18});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(2, &buffers, ctx_->stream());

   LOG(INFO) << "to get output";
-  float *y_cpu = new float[18];
-  engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
+  GetOutput(&y_cpu);

   ASSERT_EQ(y_cpu[0], 4.0);
   ASSERT_EQ(y_cpu[1], 6.0);
 }

 TEST_F(TensorRTEngineTest, test_pool2d) {
   // Weight in CPU memory.
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::Dims3{1, 2, 2});
+  std::vector<void *> buffers(2);  // TRT binded inputs

   nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
-  auto *pool_layer =
-      TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
-                           nvinfer1::DimsHW{2, 2});
+  auto *pool_layer =
+      TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast<nvinfer1::ITensor *>(x),
+                           pool_t, nvinfer1::DimsHW{2, 2});

   PADDLE_ENFORCE(pool_layer != nullptr);
   pool_layer->setStride(nvinfer1::DimsHW{1, 1});
...
@@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);

   // fill in real data
-  float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           8 * sizeof(float));
-  engine_->Execute(2);
+  std::vector<float> x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {2});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(2, &buffers, ctx_->stream());

   LOG(INFO) << "to get output";
-  float *y_cpu = new float[2];
-  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
+  GetOutput(&y_cpu);

   ASSERT_EQ(y_cpu[0], 2.0);
   ASSERT_EQ(y_cpu[1], 5.0);
...
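The reworked fixture stages data through framework::Tensor and the tensor_util.h helpers instead of the removed SetInputFromCPU/GetOutputInCPU methods. A small standalone sketch of that copy pattern, assuming CUDA device 0 is available (the function name RoundTrip is illustrative):

#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"

// Sketch: copy host data to a GPU tensor and back, as the test fixture does.
void RoundTrip() {
  paddle::platform::CUDADeviceContext ctx(paddle::platform::CUDAPlace(0));
  paddle::framework::Tensor gpu_tensor;
  std::vector<float> host_in = {1.f, 2.f, 3.f};
  paddle::framework::TensorFromVector(host_in, ctx, &gpu_tensor);  // H2D copy
  std::vector<float> host_out;
  paddle::framework::TensorToVector(gpu_tensor, ctx, &host_out);   // D2H copy
  ctx.Wait();  // the copies run on the context's stream; wait before using host_out
}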
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
...
@@ -366,15 +366,17 @@ TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
 #define NEW_TENSOR(name__) \
   auto name__##_tensor = predictor->GetInputTensor(#name__);

-  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
+
   double total_time_of_threads{0};
   std::vector<std::thread> threads;

   for (int tid = 0; tid < FLAGS_num_threads; tid++) {
     threads.emplace_back([&, tid] {
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
       NEW_TENSOR(data_lod_attention);
       NEW_TENSOR(cell_init);
       NEW_TENSOR(data);
...
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -266,15 +266,17 @@ TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
   SetConfig(&config);
   config.SwitchUseFeedFetchOps(false);

-  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
+
   double total_time_of_threads{0};
   std::vector<std::thread> threads;
   for (int tid = 0; tid < FLAGS_num_threads; tid++) {
     threads.emplace_back([&, tid] {
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
       std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
       PrepareZeroCopyInputs(predictor, &inputs);
       auto output_tensor = predictor->GetOutputTensor(out_var_name);
...
paddle/fluid/inference/tests/api/tester_helper.h
@@ -17,8 +17,10 @@
 #include <gtest/gtest.h>
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <thread>  // NOLINT
+#include <unordered_map>
 #include <vector>
 #ifdef WITH_GPERFTOOLS
 #include <gperftools/profiler.h>
...
@@ -252,7 +254,11 @@ void TestMultiThreadPrediction(
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
-  auto main_predictor = CreateTestPredictor(config, use_analysis);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
+  for (int tid = 1; tid < num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
+
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
...
@@ -260,9 +266,7 @@ void TestMultiThreadPrediction(
     // Each thread should have local inputs and outputs.
     // The inputs of each thread are all the same.
     std::vector<PaddleTensor> outputs_tid;
-    // To ensure the thread binding correctly,
-    // please clone inside the threadpool.
-    auto predictor = main_predictor->Clone();
+    auto &predictor = predictors[tid];
 #ifdef PADDLE_WITH_MKLDNN
     if (use_analysis) {
       static_cast<AnalysisPredictor *>(predictor.get())
...
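The change is the same across the three test files above: instead of cloning the predictor inside each worker thread, one predictor is created up front, cloned once per thread on the main thread, and every worker only indexes into the shared vector. A minimal sketch of that pattern, using a toy Predictor type with placeholder Clone()/Run() members rather than Paddle's real PaddlePredictor API:

    #include <memory>
    #include <thread>
    #include <vector>

    // Toy stand-in; the real tests use PaddlePredictor and CreatePaddlePredictor.
    struct Predictor {
      std::unique_ptr<Predictor> Clone() {
        return std::unique_ptr<Predictor>(new Predictor());
      }
      void Run(int tid) { (void)tid; /* per-thread inference would go here */ }
    };

    void RunThreads(int num_threads) {
      // Clone everything on the main thread first ...
      std::vector<std::unique_ptr<Predictor>> predictors;
      predictors.emplace_back(new Predictor());
      for (int tid = 1; tid < num_threads; ++tid) {
        predictors.emplace_back(predictors.front()->Clone());
      }
      // ... then each worker thread only touches its own slot.
      std::vector<std::thread> threads;
      for (int tid = 0; tid < num_threads; ++tid) {
        threads.emplace_back([&, tid] { predictors[tid]->Run(tid); });
      }
      for (auto &t : threads) t.join();
    }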
paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -54,7 +54,8 @@ void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir,
   if (use_gpu) {
     config->EnableUseGpu(100, 0);
     if (use_tensorrt) {
-      config->EnableTensorRtEngine(1 << 10, batch_size);
+      config->EnableTensorRtEngine(1 << 10, batch_size, 3,
+                                   AnalysisConfig::Precision::kFloat32, false);
       config->pass_builder()->DeletePass("conv_bn_fuse_pass");
       config->pass_builder()->DeletePass("fc_fuse_pass");
       config->pass_builder()->TurnOnDebug();
...
paddle/fluid/memory/allocation/allocator.cc
@@ -26,20 +26,17 @@ Allocator::~Allocator() {}
 bool Allocator::IsAllocThreadSafe() const { return false; }

 AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
-  VLOG(2) << "Alloc allocation on " << typeid(*this).name();
   auto ptr = AllocateImpl(size, attr);
   ptr->RegisterAllocatorChain(this);
-  VLOG(2) << "Alloc success";
   return AllocationPtr(ptr);
 }

 void Allocator::FreeImpl(Allocation* allocation) {
-  auto* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopAllocator();
   allocator->Free(allocation);
 }

 void Allocator::Free(Allocation* allocation) {
-  VLOG(2) << "Free allocation on " << typeid(*this).name();
   allocation->PopAllocator();
   FreeImpl(allocation);
 }
...
@@ -47,7 +44,7 @@ void Allocator::Free(Allocation* allocation) {
 const char* BadAlloc::what() const noexcept { return msg_.c_str(); }

 void AllocationDeleter::operator()(Allocation* allocation) const {
-  auto* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopAllocator();
   allocator->Free(allocation);
 }
...
paddle/fluid/memory/allocation/allocator.h
@@ -16,7 +16,7 @@
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/small_stack.h"
+#include "paddle/fluid/framework/inlined_stack.h"
 #include "paddle/fluid/platform/place.h"

 namespace paddle {
...
@@ -82,7 +82,7 @@ class Allocation {
   std::vector<Allocator*> GetAllocatorChain() const {
     std::vector<Allocator*> allocators;
     for (size_t i = 0; i < allocator_chain_.size(); ++i) {
-      allocators[i] = allocator_chain_[i];
+      allocators.push_back(allocator_chain_[i]);
     }
     return allocators;
   }
...
@@ -100,7 +100,7 @@ class Allocation {
   void* ptr_;
   size_t size_;
   platform::Place place_;
-  framework::SmallStack<Allocator*, 8> allocator_chain_;
+  framework::InlinedStack<Allocator*, 8> allocator_chain_;

   friend class Allocator;
   friend class AllocationDeleter;
...
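The GetAllocatorChain() fix above matters because writing `allocators[i] = ...` into a default-constructed std::vector whose size is still zero is undefined behaviour; push_back grows the vector as elements are copied. A small standalone sketch of the corrected idiom (the reserve call is an optional optimization that is not in the diff):

    #include <vector>

    // Copy all elements of `src` into a new vector. Writing into an empty
    // vector by index would be undefined behaviour; push_back (optionally
    // after reserve) is the safe form.
    template <typename T>
    std::vector<T> CopyToVector(const std::vector<T>& src) {
      std::vector<T> out;
      out.reserve(src.size());   // optional, avoids reallocations
      for (size_t i = 0; i < src.size(); ++i) {
        out.push_back(src[i]);   // grows the vector as we go
      }
      return out;
    }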
paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -36,6 +36,8 @@ DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_double(initial_gpu_memory_in_mb);
+DECLARE_double(reallocate_gpu_memory_in_mb);
 DECLARE_bool(benchmark);

 namespace paddle {
...
@@ -69,7 +71,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
   std::call_once(init_flag, []() {
     a = new detail::BuddyAllocator(
         std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
+        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(),
+        platform::CpuMaxChunkSize());
   });

   return a;
...
@@ -131,40 +134,53 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
 }

 #ifdef PADDLE_WITH_CUDA
-BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
-  static std::once_flag init_flag;
-  static detail::BuddyAllocator **a_arr = nullptr;
-  static std::vector<int> devices;
-
-  std::call_once(init_flag, [gpu_id]() {
-    devices = platform::GetSelectedDevices();
-    int gpu_num = devices.size();
-    allocation::GPUMemMonitor.Initialize(devices.size());
-    a_arr = new BuddyAllocator *[gpu_num];
-    for (size_t i = 0; i < devices.size(); ++i) {
-      int dev_id = devices[i];
-      a_arr[i] = nullptr;
-      platform::SetDeviceId(dev_id);
-      a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
-                                        new detail::GPUAllocator(dev_id)),
-                                    platform::GpuMinChunkSize(),
-                                    platform::GpuMaxChunkSize());
-
-      VLOG(10) << "\n\nNOTE: each GPU device use "
-               << FLAGS_fraction_of_gpu_memory_to_use * 100
-               << "% of GPU memory.\n"
-               << "You can set GFlags environment variable '"
-               << "FLAGS_fraction_of_gpu_memory_to_use"
-               << "' to change the fraction of GPU usage.\n\n";
-    }
-  });
-
+class GPUBuddyAllocatorList {
+ public:
+  GPUBuddyAllocatorList()
+      : allocators_(platform::GetCUDADeviceCount()),
+        flags_(platform::GetCUDADeviceCount()) {
+    allocation::GPUMemMonitor.Initialize(allocators_.size());
+  }
+
+  BuddyAllocator *Get(size_t dev_id) {
+    PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
+    std::call_once(flags_[dev_id], [this, dev_id] {
+      platform::SetDeviceId(dev_id);
+      size_t first_size = platform::GpuFirstAllocateChunkSize();
+      size_t re_size = platform::GpuReAllocateChunkSize();
+      allocators_[dev_id] =
+          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                 new detail::GPUAllocator(dev_id)),
+                             platform::GpuMinChunkSize(), first_size, re_size);
+      VLOG(2) << "\n\nNOTE: each GPU device use "
+              << string::HumanReadableSize(first_size) << "(initial chunk) "
+              << string::HumanReadableSize(re_size) << "(reallocate chunk) "
+              << "% of GPU memory.\n"
+              << "You can set GFlags environment variable '"
+              << "FLAGS_fraction_of_gpu_memory_to_use"
+              << "' or "
+                 "'FLAGS_initial_gpu_memory_in_mb/"
+                 "FLAGS_reallocate_gpu_memory_in_mb' to change the fraction "
+                 "of GPU usage.\n\n";
+      VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use="
+              << FLAGS_fraction_of_gpu_memory_to_use << ", "
+              << "FLAGS_initial_gpu_memory_in_mb="
+              << FLAGS_initial_gpu_memory_in_mb << ", "
+              << "FLAGS_reallocate_gpu_memory_in_mb="
+              << FLAGS_reallocate_gpu_memory_in_mb;
+    });
+    return allocators_[dev_id];
+  }
+
+ private:
+  std::vector<BuddyAllocator *> allocators_;
+  std::vector<std::once_flag> flags_;
+};
+
+BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
+  static GPUBuddyAllocatorList allocators;
   platform::SetDeviceId(gpu_id);
-  auto pos = std::distance(devices.begin(),
-                           std::find(devices.begin(), devices.end(), gpu_id));
-  return a_arr[pos];
+  return allocators.Get(gpu_id);
 }
 #endif
...
@@ -183,7 +199,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 #ifdef PADDLE_WITH_CUDA
   auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto *ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr) {
+  if (ptr == nullptr && size > 0) {
     int cur_dev = platform::GetCurrentDeviceId();
     platform::SetDeviceId(place.device);
     size_t avail, total;
...
@@ -234,6 +250,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::CUDAPinnedAllocator),
                             platform::CUDAPinnedMinChunkSize(),
+                            platform::CUDAPinnedMaxChunkSize(),
                             platform::CUDAPinnedMaxChunkSize());
   });
...
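The GPUBuddyAllocatorList rewrite above replaces one eager std::call_once that built allocators for every selected device with a per-device once_flag, so each device's allocator is constructed lazily the first time that device is requested. A minimal sketch of that pattern, with DeviceAllocator and MakeAllocatorFor as hypothetical stand-ins for Paddle's BuddyAllocator wiring:

    #include <mutex>
    #include <vector>

    struct DeviceAllocator { /* stand-in for BuddyAllocator */ };

    // Hypothetical factory; the real code builds a BuddyAllocator with a
    // GPUAllocator, GpuMinChunkSize() and the first/reallocate chunk sizes.
    DeviceAllocator *MakeAllocatorFor(int dev_id) {
      (void)dev_id;
      return new DeviceAllocator();
    }

    class LazyPerDeviceList {
     public:
      explicit LazyPerDeviceList(int device_count)
          : allocators_(device_count, nullptr), flags_(device_count) {}

      DeviceAllocator *Get(int dev_id) {
        // One once_flag per device: construction happens exactly once per
        // device, and only for devices that are actually used.
        std::call_once(flags_[dev_id],
                       [&] { allocators_[dev_id] = MakeAllocatorFor(dev_id); });
        return allocators_[dev_id];
      }

     private:
      std::vector<DeviceAllocator *> allocators_;
      std::vector<std::once_flag> flags_;
    };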
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc
@@ -14,16 +14,90 @@
 #include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
 #include <algorithm>
+#include <cctype>
+#include <fstream>
 #include <limits>
+#include <sstream>
+#include <string>
 #include "paddle/fluid/platform/lock_guard_ptr.h"

 DEFINE_double(tolerant_times, 2,
               "Tolerant memory size times of buffered_allocator");
+DEFINE_string(division_plan_path, "", "Division plan file path");

 namespace paddle {
 namespace memory {
 namespace allocation {

+std::string TrimStringAndToLowerCase(const std::string &str) {
+  auto not_space = [](char ch) { return std::isspace(ch) == 0; };
+  auto first_idx = static_cast<size_t>(
+      std::find_if(str.begin(), str.end(), not_space) - str.begin());
+  auto last_idx = static_cast<size_t>(
+      std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin());
+  if (first_idx == str.size() || last_idx == str.size()) return "";
+  last_idx = str.size() - 1 - last_idx;
+  auto ret = str.substr(first_idx, last_idx - first_idx);
+  std::for_each(ret.begin(), ret.end(),
+                [](char &ch) { ch = std::tolower(ch); });
+  return ret;
+}
+
+static size_t ParseStringToBytes(const std::string &str) {
+  std::string ret = str;
+  if (ret.back() == 'b') {
+    ret.pop_back();
+  }
+  PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str);
+  size_t multiples = 1;
+  switch (ret.back()) {
+    case 'g':
+      multiples *= (static_cast<size_t>(1) << 30);
+      break;
+    case 'm':
+      multiples *= (static_cast<size_t>(1) << 20);
+      break;
+    case 'k':
+      multiples *= (static_cast<size_t>(1) << 10);
+      break;
+    default:
+      break;
+  }
+  if (multiples != 1) ret.pop_back();
+  ret = TrimStringAndToLowerCase(ret);
+  double ret_val = 0.0;
+  std::stringstream ss(ret);
+  PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str);
+  return static_cast<size_t>(ret_val * multiples);
+}
+
+static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
+  std::string ret("[");
+  for (auto sz : plan) {
+    ret += string::HumanReadableSize(sz);
+    ret += ", ";
+  }
+  return ret + "]";
+}
+
+static std::vector<size_t> ReadDivisionPlanFromFile(
+    const std::string &filepath) {
+  std::ifstream is(filepath.c_str());
+  PADDLE_ENFORCE(is.good(), "File not exist");
+  std::string str;
+  std::vector<size_t> plan;
+  while (std::getline(is, str).good()) {
+    str = TrimStringAndToLowerCase(str);
+    if (str.empty()) break;
+    plan.push_back(ParseStringToBytes(str));
+  }
+  return plan;
+}
+
 static void CheckAndModifyMemoryDivisionPlan(
     std::vector<size_t> *division_plan) {
   // Check whether the division plan is strictly sorted
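ParseStringToBytes above accepts sizes such as "512k", "16mb" or "1.5g" (a trailing 'b' is stripped first, then a k/m/g multiplier is applied to the parsed number), and ReadDivisionPlanFromFile simply reads one such size per line until the first empty line. A hedged, standalone re-implementation of the same parsing rule, without Paddle's trimming/lower-casing helper and PADDLE_ENFORCE checks, with a few worked values:

    #include <cassert>
    #include <cstddef>
    #include <sstream>
    #include <string>

    // Simplified version of the rule used above: strip an optional trailing
    // 'b', apply a k/m/g multiplier, then parse the remainder as a double.
    static size_t ParseBytes(std::string s) {
      if (!s.empty() && s.back() == 'b') s.pop_back();
      size_t mul = 1;
      switch (s.empty() ? '\0' : s.back()) {
        case 'g': mul = static_cast<size_t>(1) << 30; s.pop_back(); break;
        case 'm': mul = static_cast<size_t>(1) << 20; s.pop_back(); break;
        case 'k': mul = static_cast<size_t>(1) << 10; s.pop_back(); break;
        default: break;
      }
      double value = 0.0;
      std::stringstream ss(s);
      ss >> value;
      return static_cast<size_t>(value * mul);
    }

    int main() {
      assert(ParseBytes("512k") == (static_cast<size_t>(512) << 10));
      assert(ParseBytes("16mb") == (static_cast<size_t>(16) << 20));
      assert(ParseBytes("1.5g") == static_cast<size_t>(1.5 * (1ULL << 30)));
      return 0;
    }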
@@ -50,10 +124,21 @@ static void CheckAndModifyMemoryDivisionPlan(
 }

 static std::vector<size_t> GetDefaultDivisionPlan() {
+  if (!FLAGS_division_plan_path.empty()) {
+    return ReadDivisionPlanFromFile(FLAGS_division_plan_path);
+  }
+
+  constexpr size_t kMaxLogSize = 30;
   std::vector<size_t> plan;
+  for (size_t i = 12; i <= kMaxLogSize; ++i) {
+    plan.push_back(static_cast<size_t>(1) << i);
+  }
+  /*
   for (size_t i = 0; i < sizeof(size_t) * 8; ++i) {
     plan.push_back(static_cast<size_t>(1) << i);
   }
+  */
   return plan;
 }
...
@@ -78,27 +163,32 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator(
     : underlying_allocator_(std::move(underlying_allocator)),
       division_plan_(division_plan) {
   CheckAndModifyMemoryDivisionPlan(&division_plan_);
-  allocations_.resize(division_plan_.size());
-  mtx_.resize(division_plan_.size());
+  allocations_.resize(division_plan_.size() - 1);
+  mtx_.resize(division_plan_.size() - 1);
   if (underlying_allocator_->IsAllocThreadSafe()) {
     for (auto &mtx : mtx_) {
       mtx.reset(new std::mutex());
     }
   }

+  VLOG(1) << "Division plan is: " << GetDebugStringOfPlan(division_plan_);
   VLOG(1) << "FLAGS_tolerant_times = " << FLAGS_tolerant_times;
 }

 void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
   auto bin_index = FindDivisionPlanBinIndex(division_plan_, allocation->size());
-  {
+  if (bin_index < allocations_.size()) {
     platform::LockGuardPtr<std::mutex> guard(mtx_[bin_index]);
     allocations_[bin_index].emplace(allocation->size(),
                                     AllocationPtr(allocation));
+  } else {
+    underlying_allocator_->Free(allocation);
   }
 }

-void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
+// bin_index is not used currently.
+// Maybe we can design more flexible FreeCache strategy based on bin_index
+size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
   size_t accumulated_size = 0;
   // FIXME(zjl): free the largest first when there is no extra
   for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
...
@@ -110,33 +200,53 @@ void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
       underlying_allocator_->Free(it->second.release());
       allocations_[i].erase(it--);
       if (accumulated_size >= size) {
-        return;
+        return accumulated_size;
       }
     } while (!allocations_[i].empty());
   }
+  return accumulated_size;
 }

 Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
   auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
   auto upper_size = TolerantUpperSize(size);

-  for (; upper_size >= division_plan_[bin_index]; ++bin_index) {
+  // if (bin_index >= allocations_.size()) {
+  //   VLOG(2) << "Allocate " << size << " from underlying directly";
+  //}
+  for (; bin_index < allocations_.size() &&
+         upper_size >= division_plan_[bin_index];
+       ++bin_index) {
     auto &allocation = allocations_[bin_index];
     platform::LockGuardPtr<std::mutex> lock(mtx_[bin_index]);
     auto it = allocation.lower_bound(size);
-    if (it != allocation.end() && it->second->size() < upper_size) {
+    if (it != allocation.end() && it->second->size() <= upper_size) {
+      size_t sz = it->second->size();
       auto ret = std::move(it->second);
       allocation.erase(it);
+      VLOG(3) << "Allocate " << sz << "(required " << size
+              << ") from cache directly";
       return ret.release();
     }
   }

-  try {
-    return underlying_allocator_->Allocate(size, attr).release();
-  } catch (BadAlloc &) {
-    VLOG(2) << "BadAlloc raises, try to free " << size << " caches";
-    FreeCache(size, bin_index);
-    return underlying_allocator_->Allocate(size, attr).release();
+  size_t retry_time = 1;
+  while (true) {
+    try {
+      auto ret = underlying_allocator_->Allocate(size, attr).release();
+      VLOG(2) << "Allocate " << size << " from underlying directly";
+      return ret;
+    } catch (BadAlloc &) {
+      VLOG(1) << retry_time << "-th BadAlloc raises, try to free " << size
+              << " bytes caches";
+      // size_t actual_free_size = FreeCache(size, bin_index);
+      size_t actual_free_size = FreeCache(-1UL, bin_index);
+      VLOG(1) << retry_time << "-th free " << actual_free_size
+              << " bytes caches";
+      if (actual_free_size == 0) throw;
+    }
+    ++retry_time;
   }
 }
...
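The new AllocateImpl no longer gives the underlying allocator a single second chance; it loops, freeing cached blocks (FreeCache(-1UL, ...) releases everything) and retrying until the allocation succeeds or nothing more can be freed. A minimal sketch of that retry-on-BadAlloc pattern, with TryAllocate/ReleaseCache as toy stand-ins for the underlying allocator and the cache:

    #include <cstddef>
    #include <new>
    #include <stdexcept>

    struct BadAlloc : std::runtime_error {
      using std::runtime_error::runtime_error;
    };

    // Toy stand-ins: pretend the system is full while anything is cached.
    static size_t cached_bytes = 1 << 20;

    void *TryAllocate(size_t size) {
      if (cached_bytes > 0) throw BadAlloc("out of memory");
      return ::operator new(size);
    }

    size_t ReleaseCache() {
      size_t freed = cached_bytes;  // pretend the whole cache is returned
      cached_bytes = 0;
      return freed;
    }

    void *AllocateWithRetry(size_t size) {
      while (true) {
        try {
          return TryAllocate(size);
        } catch (BadAlloc &) {
          // Free cached blocks and retry; if the cache was already empty,
          // re-throw because retrying cannot make progress.
          if (ReleaseCache() == 0) throw;
        }
      }
    }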
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h
@@ -41,7 +41,7 @@ class MultiBinBufferedAllocator : public Allocator {
   void FreeImpl(Allocation *allocation) override;

  private:
-  void FreeCache(size_t size, size_t bin_index);
+  size_t FreeCache(size_t size, size_t bin_index);

   std::shared_ptr<Allocator> underlying_allocator_;
   std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
...
paddle/fluid/memory/detail/buddy_allocator.cc
@@ -25,9 +25,11 @@ namespace detail {
 BuddyAllocator::BuddyAllocator(
     std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
-    size_t max_chunk_size)
+    size_t first_allocate_chunk_size, size_t reallocate_chunk_size)
     : min_chunk_size_(min_chunk_size),
-      max_chunk_size_(max_chunk_size),
+      first_allocate_chunk_size_(first_allocate_chunk_size),
+      reallocate_chunk_size_(reallocate_chunk_size),
+      max_chunk_size_(first_allocate_chunk_size),
       cache_(system_allocator->UseGpu()),
       system_allocator_(std::move(system_allocator)) {}
...
@@ -36,9 +38,10 @@ BuddyAllocator::~BuddyAllocator() {
                 "have actually been freed";
   while (!pool_.empty()) {
     auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
-    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+    auto desc = cache_.load(block);
+    VLOG(10) << "Free from block (" << block << ", " << desc.size << ")";

-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, desc.size, desc.index);
     cache_.invalidate(block);
     pool_.erase(pool_.begin());
   }
...
@@ -63,7 +66,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // if the allocation is huge, send directly to the system allocator
   if (size > max_chunk_size_) {
     VLOG(10) << "Allocate from system allocator.";
-    return SystemAlloc(size);
+    return SystemAlloc(size, false);
   }

   // query and allocate from the existing chunk
...
@@ -72,9 +75,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // refill the pool if failure
   if (it == pool_.end()) {
     it = RefillPool();
-    // if still failure, fail fatally
+    // if still failure, try to allocate from SystemAllocator
     if (it == pool_.end()) {
-      return nullptr;
+      return SystemAlloc(size, false);
     }
   } else {
     VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
...
@@ -98,7 +101,7 @@ void BuddyAllocator::Free(void* p) {
   VLOG(10) << "Free from address " << block;

-  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
+  if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) {
     VLOG(10) << "Free directly from system allocator";
     system_allocator_->Free(block, block->total_size(cache_),
                             block->index(cache_));
...
@@ -168,9 +171,12 @@ void BuddyAllocator::Free(void* p) {
 size_t BuddyAllocator::Used() { return total_used_; }
 size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
-size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
+size_t BuddyAllocator::GetMaxChunkSize() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return max_chunk_size_;
+}

-void* BuddyAllocator::SystemAlloc(size_t size) {
+void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(&index, size);
...
@@ -178,25 +184,23 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
   if (p == nullptr) return nullptr;

-  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
-                                     size, nullptr, nullptr);
+  static_cast<MemoryBlock*>(p)->init(
+      &cache_, is_managed ? MemoryBlock::MANAGED_HUGE_CHUNK
+                          : MemoryBlock::UNMANAGED_HUGE_CHUNK,
+      index, size, nullptr, nullptr);

   return static_cast<MemoryBlock*>(p)->data();
 }

 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
-#ifdef PADDLE_WITH_CUDA
-  if (system_allocator_->UseGpu()) {
-    if ((total_used_ + total_free_) == 0) {
-      // Compute the maximum allocation size for the first allocation.
-      max_chunk_size_ = platform::GpuMaxChunkSize();
-    }
+  if (total_used_ + total_free_ > 0) {
+    max_chunk_size_ = reallocate_chunk_size_;
   }
-#endif

   // Allocate a new maximum sized block
   size_t index = 0;
-  void* p = system_allocator_->Alloc(&index, max_chunk_size_);
+  size_t chunk_size = max_chunk_size_;
+  void* p = system_allocator_->Alloc(&index, chunk_size);

   if (p == nullptr) return pool_.end();
...
@@ -204,7 +208,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
            << " from system allocator";

   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
-                                     max_chunk_size_, nullptr, nullptr);
+                                     chunk_size, nullptr, nullptr);

   // gpu fallback allocation
   if (system_allocator_->UseGpu() &&
...
@@ -212,10 +216,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
     fallback_alloc_count_++;
   }

-  total_free_ += max_chunk_size_;
+  total_free_ += chunk_size;

   // dump the block into pool
-  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+  return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first;
 }

 BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
...
@@ -271,27 +275,24 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
 void BuddyAllocator::CleanIdleFallBackAlloc() {
   // If fallback allocation does not exist, return directly
-  if (!fallback_alloc_count_) return;
+  if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return;

   for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
     MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));

-    // If no GPU fallback allocator, return
-    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
+    auto desc = cache_.load(block);
+    if (desc.index == 0) {
       return;
     }

     VLOG(10) << "Return block " << block << " to fallback allocator.";

-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, desc.size, block->index(cache_));
     cache_.invalidate(block);

     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));

-    total_free_ -= max_chunk_size_;
+    total_free_ -= desc.size;
     fallback_alloc_count_--;

     // If no fall allocation exists, return directly
...
@@ -315,19 +316,21 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
   if (!shall_free_alloc()) return;

   for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
     MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+    auto desc = cache_.load(block);
+    if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) {
+      return;
+    }

     VLOG(10) << "Return block " << block << " to base allocator.";

-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, desc.size, desc.index);
     cache_.invalidate(block);

     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));

-    total_free_ -= max_chunk_size_;
+    total_free_ -= desc.size;

     if (!shall_free_alloc()) return;
   }
...
paddle/fluid/memory/detail/buddy_allocator.h
@@ -34,7 +34,8 @@ namespace detail {
 class BuddyAllocator {
  public:
   BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
-                 size_t min_chunk_size, size_t max_chunk_size);
+                 size_t min_chunk_size, size_t first_allocate_chunk_size,
+                 size_t reallocate_chunk_size);

   ~BuddyAllocator();
...
@@ -57,7 +58,7 @@ class BuddyAllocator {
   using PoolSet = std::set<IndexSizeAddress>;

   /*! \brief Allocate fixed-size memory from system */
-  void* SystemAlloc(size_t size);
+  void* SystemAlloc(size_t size, bool is_managed = true);

   /*! \brief If existing chunks are not suitable, refill pool */
   PoolSet::iterator RefillPool();
...
@@ -87,7 +88,11 @@ class BuddyAllocator {
   size_t total_free_ = 0;   // the total size of free memory
   size_t min_chunk_size_;   // the minimum size of each chunk

-  size_t max_chunk_size_;   // the maximum size of each chunk
+  size_t first_allocate_chunk_size_;
+  size_t reallocate_chunk_size_;
+  size_t max_chunk_size_;

  private:
  /**
...
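With the new constructor signature, a BuddyAllocator takes the size of the first chunk it requests from the system separately from the size of every later refill (compare the RefillPool change above). A sketch of how a caller wires the two arguments, mirroring the GetCPUBuddyAllocator change in legacy_allocator.cc, which simply passes CpuMaxChunkSize() for both values:

    #include <memory>

    #include "paddle/fluid/memory/detail/buddy_allocator.h"
    #include "paddle/fluid/memory/detail/system_allocator.h"
    #include "paddle/fluid/platform/cpu_info.h"

    // Sketch only: build a CPU buddy allocator with explicit first/reallocate
    // chunk sizes, as the updated GetCPUBuddyAllocator now does.
    paddle::memory::detail::BuddyAllocator *MakeCpuBuddyAllocator() {
      using paddle::memory::detail::BuddyAllocator;
      using paddle::memory::detail::CPUAllocator;
      using paddle::memory::detail::SystemAllocator;
      return new BuddyAllocator(
          std::unique_ptr<SystemAllocator>(new CPUAllocator),
          paddle::platform::CpuMinChunkSize(),
          /*first_allocate_chunk_size=*/paddle::platform::CpuMaxChunkSize(),
          /*reallocate_chunk_size=*/paddle::platform::CpuMaxChunkSize());
    }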
paddle/fluid/memory/detail/memory_block.h
@@ -27,10 +27,11 @@ class MetadataCache;
 // MemoryBlock::Desc and the payload.
 struct MemoryBlock {
   enum Type {
     FREE_CHUNK,    // memory is free and idle
     ARENA_CHUNK,   // memory is being occupied
-    HUGE_CHUNK,    // memory is out of management
-    INVALID_CHUNK  // memory is invalid
+    MANAGED_HUGE_CHUNK,    // memory is huge and out of management
+    UNMANAGED_HUGE_CHUNK,  // memory is huge and managed by allocator
+    INVALID_CHUNK          // memory is invalid
   };

   // init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
...
paddle/fluid/operators/benchmark/op_tester.cc
@@ -42,8 +42,8 @@ void OpTester::Init(const OpTesterConfig &config) {
   // Initialize the OpDesc
   if (op_desc_info.Has(config_.op_type)) {
     type_ = config_.op_type;
-    op_desc_.SetType(config_.op_type);
+    CreateOpDesc();
     CreateInputVarDesc();
     CreateOutputVarDesc();
   } else {
...
@@ -131,6 +131,40 @@ std::vector<std::string> OpTester::GetOpProtoOutputNames() {
   return output_names;
 }

+std::unordered_map<std::string, framework::proto::AttrType>
+OpTester::GetOpProtoAttrNames() {
+  std::unordered_map<std::string, framework::proto::AttrType> attr_types;
+  const framework::proto::OpProto &proto =
+      framework::OpInfoMap::Instance().Get(type_).Proto();
+  const std::vector<std::string> skipped_attrs = {
+      framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
+      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
+      framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(),
+      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()};
+  for (int i = 0; i != proto.attrs_size(); ++i) {
+    const auto &attr = proto.attrs(i);
+    if (!Has(skipped_attrs, attr.name())) {
+      VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type();
+      attr_types[attr.name()] = attr.type();
+    }
+  }
+  return attr_types;
+}
+
+framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
+  if (str == "int32") {
+    return framework::proto::VarType::INT32;
+  } else if (str == "int64") {
+    return framework::proto::VarType::INT64;
+  } else if (str == "fp32") {
+    return framework::proto::VarType::FP32;
+  } else if (str == "fp64") {
+    return framework::proto::VarType::FP64;
+  } else {
+    PADDLE_THROW("Unsupported dtype %s.", str.c_str());
+  }
+}
+
 void OpTester::CreateInputVarDesc() {
   std::vector<std::string> input_names = GetOpProtoInputNames();
   for (auto &name : input_names) {
...
@@ -145,11 +179,11 @@ void OpTester::CreateInputVarDesc() {
     // Need to support more type
     var->SetType(framework::proto::VarType::LOD_TENSOR);
     var->SetPersistable(false);
-    var->SetDataType(framework::proto::VarType::FP32);
+    var->SetDataType(TransToVarType(input->dtype));
     var->SetShape(input->dims);

     op_desc_.SetInput(name, {var_name});
-    input_lods_[var_name] = input->lod;
+    inputs_[var_name] = *input;
   }
 }
...
@@ -167,6 +201,49 @@ void OpTester::CreateOutputVarDesc() {
   }
 }

+void OpTester::CreateOpDesc() {
+  op_desc_.SetType(config_.op_type);
+  std::unordered_map<std::string, framework::proto::AttrType> attr_types =
+      GetOpProtoAttrNames();
+  for (auto item : config_.attrs) {
+    const std::string &name = item.first;
+    if (attr_types.find(name) == attr_types.end()) {
+      LOG(FATAL) << "Operator " << type_ << " do not have attr " << name;
+    }
+
+    const std::string &value_str = item.second;
+    const framework::proto::AttrType &type = attr_types[name];
+    switch (type) {
+      case framework::proto::AttrType::BOOLEAN:
+        break;
+      case framework::proto::AttrType::INT: {
+        int value = StringTo<int>(value_str);
+        op_desc_.SetAttr(name, {value});
+      } break;
+      case framework::proto::AttrType::FLOAT: {
+        float value = StringTo<float>(value_str);
+        op_desc_.SetAttr(name, {value});
+      } break;
+      case framework::proto::AttrType::STRING: {
+        op_desc_.SetAttr(name, {value_str});
+      } break;
+      case framework::proto::AttrType::BOOLEANS:
+      case framework::proto::AttrType::INTS:
+      case framework::proto::AttrType::FLOATS:
+      case framework::proto::AttrType::STRINGS:
+        LOG(FATAL) << "Not supported yet.";
+        break;
+      case framework::proto::AttrType::LONG: {
+        int64_t value = StringTo<int64_t>(value_str);
+        op_desc_.SetAttr(name, value);
+      } break;
+      case framework::proto::AttrType::LONGS:
+      default:
+        PADDLE_THROW("Unsupport attr type %d", type);
+    }
+  }
+}
+
 framework::VarDesc *OpTester::Var(const std::string &name) {
   auto it = vars_.find(name);
   if (it != vars_.end()) {
...
@@ -179,24 +256,41 @@ framework::VarDesc *OpTester::Var(const std::string &name) {
 template <typename T>
 void OpTester::SetupTensor(framework::LoDTensor *tensor,
                            const std::vector<int64_t> &shape, T lower,
-                           T upper) {
+                           T upper, const std::string &initializer) {
   static unsigned int seed = 100;
   std::mt19937 rng(seed++);
   std::uniform_real_distribution<double> uniform_dist(0, 1);

   T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
-  if (platform::is_cpu_place(place_)) {
-    for (int i = 0; i < tensor->numel(); ++i) {
-      ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
-    }
+
+  framework::LoDTensor cpu_tensor;
+  T *cpu_ptr = nullptr;
+
+  if (!platform::is_cpu_place(place_)) {
+    cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
+                                         platform::CPUPlace());
   } else {
-    framework::LoDTensor cpu_tensor;
-    T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
-                                            platform::CPUPlace());
+    cpu_ptr = ptr;
+  }
+
+  if (initializer == "random") {
     for (int i = 0; i < cpu_tensor.numel(); ++i) {
       cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
     }
+  } else if (initializer == "natural") {
+    for (int i = 0; i < cpu_tensor.numel(); ++i) {
+      cpu_ptr[i] = lower + i;
+    }
+  } else if (initializer == "zeros") {
+    for (int i = 0; i < cpu_tensor.numel(); ++i) {
+      cpu_ptr[i] = 0;
+    }
+  } else {
+    PADDLE_THROW("Unsupported initializer %s.", initializer.c_str());
+  }
+
+  if (!platform::is_cpu_place(place_)) {
     TensorCopySync(cpu_tensor, place_, tensor);
   }
 }
...
@@ -219,7 +313,7 @@ void OpTester::CreateVariables(framework::Scope *scope) {
     }
   }

-  for (auto &item : input_lods_) {
+  for (auto &item : inputs_) {
     // Allocate memory for input tensor
     auto &var_name = item.first;
     VLOG(3) << "Allocate memory for tensor " << var_name;
...
@@ -229,11 +323,23 @@ void OpTester::CreateVariables(framework::Scope *scope) {
     auto *var = scope->Var(var_name);
     auto *tensor = var->GetMutable<framework::LoDTensor>();
-    SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
-                       static_cast<float>(1.0));
+    const auto &data_type = var_desc->GetDataType();
+    if (data_type == framework::proto::VarType::INT32) {
+      SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer);
+    } else if (data_type == framework::proto::VarType::INT64) {
+      SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer);
+    } else if (data_type == framework::proto::VarType::FP32) {
+      SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
+                         static_cast<float>(1.0), item.second.initializer);
+    } else if (data_type == framework::proto::VarType::FP64) {
+      SetupTensor<double>(tensor, shape, static_cast<double>(0.0),
+                          static_cast<double>(1.0), item.second.initializer);
+    } else {
+      PADDLE_THROW("Unsupported dtype %d.", data_type);
+    }

     VLOG(3) << "Set lod for tensor " << var_name;
-    std::vector<std::vector<size_t>> &lod_vec = item.second;
+    std::vector<std::vector<size_t>> &lod_vec = item.second.lod;
     framework::LoD lod;
     for (size_t i = 0; i < lod_vec.size(); ++i) {
       lod.push_back(lod_vec[i]);
...
@@ -261,7 +367,16 @@ std::string OpTester::DebugString() {
     ss << GenSpaces(count) << "type: LOD_TENSOR\n";
     ss << GenSpaces(count++) << "lod_tensor {\n";
     ss << GenSpaces(count++) << "tensor {\n";
-    ss << GenSpaces(count) << "data_type: FP32\n";
+    const auto &data_type = var->GetDataType();
+    if (data_type == framework::proto::VarType::INT32) {
+      ss << GenSpaces(count) << "data_type: INT32\n";
+    } else if (data_type == framework::proto::VarType::INT64) {
+      ss << GenSpaces(count) << "data_type: INT64\n";
+    } else if (data_type == framework::proto::VarType::FP32) {
+      ss << GenSpaces(count) << "data_type: FP32\n";
+    } else if (data_type == framework::proto::VarType::FP64) {
+      ss << GenSpaces(count) << "data_type: FP64\n";
+    }
     std::vector<int64_t> shape = var->GetShape();
     for (auto d : shape) {
       ss << GenSpaces(count) << "dims: " << d << "\n";
...
@@ -288,6 +403,63 @@ std::string OpTester::DebugString() {
     ss << GenSpaces(--count) << "}\n";
   }
   ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
+  for (auto &name : op_desc_.AttrNames()) {
+    ss << GenSpaces(count++) << "attrs {\n";
+    const auto &attr_type = op_desc_.GetAttrType(name);
+    const auto &attr = op_desc_.GetAttr(name);
+    ss << GenSpaces(count) << "name: \"" << name << "\"\n";
+    switch (attr_type) {
+      case framework::proto::AttrType::BOOLEAN: {
+        ss << GenSpaces(count) << "type: BOOLEAN\n";
+        ss << GenSpaces(count) << "b: " << boost::get<bool>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::INT: {
+        ss << GenSpaces(count) << "type: INT\n";
+        ss << GenSpaces(count) << "i: " << boost::get<int>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::FLOAT: {
+        ss << GenSpaces(count) << "type: FLOAT\n";
+        ss << GenSpaces(count) << "f: " << boost::get<float>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::STRING: {
+        ss << GenSpaces(count) << "type: STRING\n";
+        ss << GenSpaces(count) << "s: \"" << boost::get<std::string>(attr)
+           << "\"\n";
+      } break;
+      case framework::proto::AttrType::BOOLEANS: {
+        ss << GenSpaces(count) << "type: BOOLEANS\n";
+        ss << GenSpaces(count) << "bools: " << "\n";
+      } break;
+      case framework::proto::AttrType::INTS: {
+        ss << GenSpaces(count) << "type: INTS\n";
+        ss << GenSpaces(count) << "ints: " << "\n";
+      } break;
+      case framework::proto::AttrType::FLOATS: {
+        ss << GenSpaces(count) << "type: FLOATS\n";
+        ss << GenSpaces(count) << "floats: " << "\n";
+      } break;
+      case framework::proto::AttrType::STRINGS: {
+        ss << GenSpaces(count) << "type: STRINGS\n";
+        ss << GenSpaces(count) << "strings: " << "\n";
+      } break;
+      case framework::proto::AttrType::LONG: {
+        ss << GenSpaces(count) << "type: LONG\n";
+        ss << GenSpaces(count) << "l: " << boost::get<int64_t>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::LONGS: {
+        ss << GenSpaces(count) << "type: LONGS\n";
+        ss << GenSpaces(count) << "longs: " << "\n";
+      } break;
+      default:
+        PADDLE_THROW("Unsupport attr type %d", attr_type);
+    }
+    ss << GenSpaces(--count) << "}\n";
+  }
   ss << GenSpaces(--count) << "}\n";
   return ss.str();
 }
...
@@ -299,6 +471,7 @@ TEST(op_tester, base) {
                         FLAGS_op_config_list.c_str());
   std::vector<OpTesterConfig> op_configs;
   while (!fin.eof()) {
+    VLOG(4) << "Reading config " << op_configs.size() << "...";
     OpTesterConfig config;
     bool result = config.Init(fin);
     if (result) {
...
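The new `initializer` field in op_tester selects how the CPU-side buffer is filled before the optional copy to the device: "random" keeps the old uniform values, "natural" writes lower + i, and "zeros" clears the buffer. A standalone sketch of the three modes, filling a plain std::vector instead of a LoDTensor:

    #include <random>
    #include <string>
    #include <vector>

    // Fill `data` the way the three initializer modes above do.
    void FillBuffer(std::vector<float> *data, float lower, float upper,
                    const std::string &initializer) {
      static unsigned int seed = 100;
      std::mt19937 rng(seed++);
      std::uniform_real_distribution<double> uniform_dist(0, 1);
      for (size_t i = 0; i < data->size(); ++i) {
        if (initializer == "random") {
          (*data)[i] =
              static_cast<float>(uniform_dist(rng) * (upper - lower) + lower);
        } else if (initializer == "natural") {
          (*data)[i] = lower + i;  // 0, 1, 2, ... when lower == 0
        } else if (initializer == "zeros") {
          (*data)[i] = 0;
        }
      }
    }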
paddle/fluid/operators/benchmark/op_tester.h
@@ -14,7 +14,9 @@ limitations under the License. */
 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/op_desc.h"
...
@@ -39,16 +41,21 @@ class OpTester {
  private:
   std::vector<std::string> GetOpProtoInputNames();
   std::vector<std::string> GetOpProtoOutputNames();
+  std::unordered_map<std::string, framework::proto::AttrType>
+  GetOpProtoAttrNames();

+  framework::proto::VarType::Type TransToVarType(std::string str);
   void CreateInputVarDesc();
   void CreateOutputVarDesc();
+  void CreateOpDesc();

   framework::VarDesc *Var(const std::string &name);
   void CreateVariables(framework::Scope *scope);

   template <typename T>
   void SetupTensor(framework::LoDTensor *input,
-                   const std::vector<int64_t> &shape, T lower, T upper);
+                   const std::vector<int64_t> &shape, T lower, T upper,
+                   const std::string &initializer);

   void RunImpl();
...
@@ -57,7 +64,7 @@ class OpTester {
   std::string type_;
   framework::OpDesc op_desc_;
   std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
-  std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
+  std::unordered_map<std::string, OpInputConfig> inputs_;
   std::unique_ptr<framework::OperatorBase> op_;
   platform::Place place_;
   std::unique_ptr<framework::Scope> scope_;
...
paddle/fluid/operators/benchmark/op_tester_config.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/benchmark/op_tester_config.h"
 #include <fstream>
-#include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {
...
@@ -40,6 +39,62 @@ static void EraseEndSep(std::string* str,
   }
 }

+OpInputConfig::OpInputConfig(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (sep != kEndSeparator) {
+      is >> sep;
+      if (sep == "name" || sep == "name:") {
+        is >> name;
+        EraseEndSep(&name);
+      } else if (sep == "dtype" || sep == "dtype:") {
+        ParseDType(is);
+      } else if (sep == "initializer" || sep == "initializer:") {
+        ParseInitializer(is);
+      } else if (sep == "dims" || sep == "dims:") {
+        ParseDims(is);
+      } else if (sep == "lod" || sep == "lod:") {
+        ParseLoD(is);
+      }
+    }
+  }
+}
+
+void OpInputConfig::ParseDType(std::istream& is) {
+  std::string dtype_str;
+  is >> dtype_str;
+  EraseEndSep(&dtype_str);
+  if (dtype_str == "int32" || dtype_str == "int") {
+    dtype = "int32";
+  } else if (dtype_str == "int64" || dtype_str == "long") {
+    dtype = "int64";
+  } else if (dtype_str == "fp32" || dtype_str == "float") {
+    dtype = "fp32";
+  } else if (dtype_str == "fp64" || dtype_str == "double") {
+    dtype = "fp64";
+  } else {
+    PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str());
+  }
+  VLOG(4) << "dtype of input " << name << " is: " << dtype;
+}
+
+void OpInputConfig::ParseInitializer(std::istream& is) {
+  std::string initializer_str;
+  is >> initializer_str;
+  EraseEndSep(&initializer_str);
+
+  const std::vector<std::string> supported_initializers = {"random", "natural",
+                                                           "zeros"};
+  if (!Has(supported_initializers, initializer_str)) {
+    PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str());
+  }
+
+  initializer = initializer_str;
+  VLOG(4) << "initializer of input " << name << " is: " << initializer;
+}
+
 void OpInputConfig::ParseDims(std::istream& is) {
   std::string dims_str;
   is >> dims_str;
...
@@ -84,7 +139,7 @@ void OpInputConfig::ParseLoD(std::istream& is) {
         number += lod_str[i];
         ++i;
       }
-      level.push_back(atoi(number.c_str()));
+      level.push_back(StringTo<size_t>(number));
     }
     lod.push_back(level);
   } else if (lod_str[i] == '}') {
...
@@ -93,24 +148,6 @@ void OpInputConfig::ParseLoD(std::istream& is) {
   }
 }

-OpInputConfig::OpInputConfig(std::istream& is) {
-  std::string sep;
-  is >> sep;
-  if (sep == kStartSeparator) {
-    while (sep != kEndSeparator) {
-      is >> sep;
-      if (sep == "name" || sep == "name:") {
-        is >> name;
-        EraseEndSep(&name);
-      } else if (sep == "dims" || sep == "dims:") {
-        ParseDims(is);
-      } else if (sep == "lod" || sep == "lod:") {
-        ParseLoD(is);
-      }
-    }
-  }
-}
-
 OpTesterConfig::OpTesterConfig(const std::string &filename) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
   PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
...
@@ -167,6 +204,7 @@ bool OpTesterConfig::ParseAttrs(std::istream& is) {
     is >> value;
     EraseEndSep(&key, ":");
     EraseEndSep(&value);
+    VLOG(4) << "attrs: " << key << ", " << value;

     attrs[key] = value;
   }
...
paddle/fluid/operators/benchmark/op_tester_config.h
View file @ 2c4fcaa6
...
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <istream>
+#include <sstream>
 #include <string>
 #include <unordered_map>
 #include <vector>
...
@@ -27,10 +28,14 @@ struct OpInputConfig {
   OpInputConfig() {}
   explicit OpInputConfig(std::istream& is);

+  void ParseDType(std::istream& is);
+  void ParseInitializer(std::istream& is);
   void ParseDims(std::istream& is);
   void ParseLoD(std::istream& is);

   std::string name;
+  std::string dtype{"fp32"};          // int32/int, int64/long, fp32/float, fp64/double
+  std::string initializer{"random"};  // random, natural
   std::vector<int64_t> dims;
   std::vector<std::vector<size_t>> lod;
 };
...
@@ -55,6 +60,23 @@ struct OpTesterConfig {
   double runtime{0.0};
 };

+static bool Has(const std::vector<std::string>& vec, const std::string& item) {
+  for (size_t i = 0; i < vec.size(); ++i) {
+    if (vec[i] == item) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename T>
+T StringTo(const std::string& str) {
+  std::istringstream is(str);
+  T value;
+  is >> value;
+  return value;
+}
+
 }  // namespace benchmark
 }  // namespace operators
 }  // namespace paddle
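
The two helpers added above are small, self-contained utilities. The following standalone sketch (not part of the patch; the copies are only here so the snippet compiles on its own) shows how they behave when an op_tester config value is converted or validated:

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Copies of the helpers added in op_tester_config.h.
static bool Has(const std::vector<std::string>& vec, const std::string& item) {
  for (size_t i = 0; i < vec.size(); ++i) {
    if (vec[i] == item) return true;
  }
  return false;
}

template <typename T>
T StringTo(const std::string& str) {
  std::istringstream is(str);
  T value;
  is >> value;
  return value;
}

int main() {
  // StringTo<size_t> replaces the old atoi() call in OpInputConfig::ParseLoD.
  std::cout << StringTo<size_t>("128") << "\n";  // prints 128
  // Has() validates the "initializer" keyword against a whitelist.
  std::vector<std::string> supported = {"random", "natural", "zeros"};
  std::cout << Has(supported, "zeros") << "\n";  // prints 1
  return 0;
}
```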
paddle/fluid/operators/cast_op.cc
View file @ 2c4fcaa6
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/cast_op.h"
+#include <memory>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
...
@@ -30,7 +31,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 Cast Operator.

 This Operator casts the input tensor to another data type and
-returns tha Output Tensor.
+returns the Output Tensor. It's meaningless if the output dtype equals
+the input dtype, but it's fine if you do so.

 )DOC");
   }
...
paddle/fluid/operators/detection/CMakeLists.txt
View file @ 2c4fcaa6
...
@@ -33,11 +33,14 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
+detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)

 if(WITH_GPU)
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
+  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub)
 else()
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
+  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
 endif()

 detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
...
paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
0 → 100644
View file @ 2c4fcaa6
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"

namespace paddle {
namespace operators {

using LoDTensor = framework::LoDTensor;

class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(
        ctx->HasInput("PriorBox"),
        "Input(PriorBox) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("PriorBoxVar"),
        "Input(PriorBoxVar) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("TargetBox"),
        "Input(TargetBox) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("BoxScore"),
        "Input(BoxScore) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("DecodeBox"),
        "Output(DecodeBox) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("OutputAssignBox"),
        "Output(OutputAssignBox) of BoxDecoderAndAssignOp should not be null.");

    auto prior_box_dims = ctx->GetInputDim("PriorBox");
    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
    auto target_box_dims = ctx->GetInputDim("TargetBox");
    auto box_score_dims = ctx->GetInputDim("BoxScore");

    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
                      "The rank of Input of PriorBox must be 2");
    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
    PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 1,
                      "The rank of Input of PriorBoxVar must be 1");
    PADDLE_ENFORCE_EQ(prior_box_var_dims[0], 4,
                      "The shape of PriorBoxVar is [4]");
    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
                      "The rank of Input of TargetBox must be 2");
    PADDLE_ENFORCE_EQ(box_score_dims.size(), 2,
                      "The rank of Input of BoxScore must be 2");
    PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0],
                      "The first dim of prior_box and target_box is roi nums "
                      "and should be same!");
    PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0],
                      "The first dim of prior_box and box_score is roi nums "
                      "and should be same!");
    PADDLE_ENFORCE_EQ(target_box_dims[1], box_score_dims[1] * prior_box_dims[1],
                      "The shape of target_box is [N, classnum * 4], The shape "
                      "of box_score is [N, classnum], The shape of prior_box "
                      "is [N, 4]");

    ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0],
                                                         target_box_dims[1]}));
    ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox");
    ctx->SetOutputDim(
        "OutputAssignBox",
        framework::make_ddim({prior_box_dims[0], prior_box_dims[1]}));
    ctx->ShareLoD("PriorBox", /*->*/ "OutputAssignBox");
  }
};

class BoxDecoderAndAssignOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput(
        "PriorBox",
        "(Tensor, default Tensor<float>) "
        "Box list PriorBox is a 2-D Tensor with shape [N, 4] which holds N "
        "boxes and each box is represented as [xmin, ymin, xmax, ymax], "
        "[xmin, ymin] is the left top coordinate of the anchor box, "
        "if the input is image feature map, they are close to the origin "
        "of the coordinate system. [xmax, ymax] is the right bottom "
        "coordinate of the anchor box.");
    AddInput("PriorBoxVar",
             "(Tensor, default Tensor<float>, optional) "
             "PriorBoxVar is a 2-D Tensor with shape [N, 4] which holds N "
             "group of variance. PriorBoxVar will set all elements to 1 by "
             "default.")
        .AsDispensable();
    AddInput("TargetBox",
             "(LoDTensor or Tensor) "
             "This input can be a 2-D LoDTensor with shape "
             "[N, classnum*4]. It holds N targets for N boxes.");
    AddInput("BoxScore",
             "(LoDTensor or Tensor) "
             "This input can be a 2-D LoDTensor with shape "
             "[N, classnum], each box is represented as [classnum] which is "
             "the classification probabilities.");
    AddAttr<float>("box_clip",
                   "(float, default 4.135, np.log(1000. / 16.)) "
                   "clip box to prevent overflowing")
        .SetDefault(4.135f);
    AddOutput("DecodeBox",
              "(LoDTensor or Tensor) "
              "the output tensor of op with shape [N, classnum * 4] "
              "representing the result of N target boxes decoded with "
              "M Prior boxes and variances for each class.");
    AddOutput("OutputAssignBox",
              "(LoDTensor or Tensor) "
              "the output tensor of op with shape [N, 4] "
              "representing the result of N target boxes decoded with "
              "M Prior boxes and variances with the best non-background class "
              "by BoxScore.");
    AddComment(R"DOC(

Bounding Box Coder.

Decode the target bounding box with the prior_box information.

The Decoding schema is described below:

    $$
    ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2}
    $$
    $$
    oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2}
    $$
    $$
    ow = \\exp (pwv \\times tw) \\times pw + \\frac{tw}{2}
    $$
    $$
    oh = \\exp (phv \\times th) \\times ph + \\frac{th}{2}
    $$

where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
`phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the
decoded coordinates, width and height in decode_box.

decode_box is obtained after box decode, then assigning schema is described below:

For each prior_box, use the best non-background class's decoded values to
update the prior_box locations and get output_assign_box. So, the shape of
output_assign_box is the same as PriorBox.
)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(box_decoder_and_assign, ops::BoxDecoderAndAssignOp,
                  ops::BoxDecoderAndAssignOpMaker,
                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
    box_decoder_and_assign,
    ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, float>,
    ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, double>);
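
To make the decoding schema above concrete, here is a small standalone sketch (not part of the patch) that applies the same formulas used by the CPU and GPU kernels to one prior box and one set of target offsets; the numeric inputs are made up for illustration:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  // One prior box [xmin, ymin, xmax, ymax] and its variance [pxv, pyv, pwv, phv].
  const double prior[4] = {10.0, 10.0, 49.0, 29.0};
  const double var[4] = {0.1, 0.1, 0.2, 0.2};
  // Target offsets [tx, ty, tw, th] for one class.
  const double target[4] = {0.5, -0.2, 0.3, 0.1};
  const double box_clip = 4.135;  // default attribute, log(1000 / 16)

  // Prior box width/height/center, with the pixel-inclusive +1 convention
  // used by the kernels.
  double pw = prior[2] - prior[0] + 1;
  double ph = prior[3] - prior[1] + 1;
  double px = prior[0] + pw / 2;
  double py = prior[1] + ph / 2;

  // Clip dw/dh before exponentiation, as the kernels do with box_clip.
  double dw = std::min(var[2] * target[2], box_clip);
  double dh = std::min(var[3] * target[3], box_clip);

  double ox_c = var[0] * target[0] * pw + px;  // decoded center x
  double oy_c = var[1] * target[1] * ph + py;  // decoded center y
  double ow = std::exp(dw) * pw;               // decoded width
  double oh = std::exp(dh) * ph;               // decoded height

  // Convert back to corner form, mirroring output_box_data[] in the kernels.
  printf("decoded box: [%.2f, %.2f, %.2f, %.2f]\n",
         ox_c - ow / 2, oy_c - oh / 2, ox_c + ow / 2 - 1, oy_c + oh / 2 - 1);
  return 0;
}
```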
paddle/fluid/operators/detection/box_decoder_and_assign_op.cu
0 → 100644
View file @ 2c4fcaa6
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"

namespace paddle {
namespace operators {

template <typename T>
__global__ void DecodeBoxKernel(const T *prior_box_data,
                                const T *prior_box_var_data,
                                const T *target_box_data, const int roi_num,
                                const int class_num, const T box_clip,
                                T *output_box_data) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < roi_num * class_num) {
    int i = idx / class_num;
    int j = idx % class_num;
    T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
    T prior_box_height =
        prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
    T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
    T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;

    int offset = i * class_num * 4 + j * 4;
    T dw = prior_box_var_data[2] * target_box_data[offset + 2];
    T dh = prior_box_var_data[3] * target_box_data[offset + 3];
    if (dw > box_clip) {
      dw = box_clip;
    }
    if (dh > box_clip) {
      dh = box_clip;
    }
    T target_box_center_x = 0, target_box_center_y = 0;
    T target_box_width = 0, target_box_height = 0;
    target_box_center_x =
        prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
        prior_box_center_x;
    target_box_center_y =
        prior_box_var_data[1] * target_box_data[offset + 1] * prior_box_height +
        prior_box_center_y;
    target_box_width = expf(dw) * prior_box_width;
    target_box_height = expf(dh) * prior_box_height;

    output_box_data[offset] = target_box_center_x - target_box_width / 2;
    output_box_data[offset + 1] = target_box_center_y - target_box_height / 2;
    output_box_data[offset + 2] =
        target_box_center_x + target_box_width / 2 - 1;
    output_box_data[offset + 3] =
        target_box_center_y + target_box_height / 2 - 1;
  }
}

template <typename T>
__global__ void AssignBoxKernel(const T *prior_box_data,
                                const T *box_score_data, T *output_box_data,
                                const int roi_num, const int class_num,
                                T *output_assign_box_data) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < roi_num) {
    int i = idx;
    T max_score = -1;
    int max_j = -1;
    for (int j = 0; j < class_num; ++j) {
      T score = box_score_data[i * class_num + j];
      if (score > max_score && j > 0) {
        max_score = score;
        max_j = j;
      }
    }
    if (max_j > 0) {
      for (int pno = 0; pno < 4; pno++) {
        output_assign_box_data[i * 4 + pno] =
            output_box_data[i * class_num * 4 + max_j * 4 + pno];
      }
    } else {
      for (int pno = 0; pno < 4; pno++) {
        output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
      }
    }
  }
}

template <typename DeviceContext, typename T>
class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
                   "This kernel only runs on GPU device.");
    auto *prior_box = context.Input<framework::LoDTensor>("PriorBox");
    auto *prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
    auto *target_box = context.Input<framework::LoDTensor>("TargetBox");
    auto *box_score = context.Input<framework::LoDTensor>("BoxScore");
    auto *output_box = context.Output<framework::Tensor>("DecodeBox");
    auto *output_assign_box =
        context.Output<framework::Tensor>("OutputAssignBox");

    auto roi_num = target_box->dims()[0];
    auto class_num = box_score->dims()[1];
    auto *target_box_data = target_box->data<T>();
    auto *prior_box_data = prior_box->data<T>();
    auto *prior_box_var_data = prior_box_var->data<T>();
    auto *box_score_data = box_score->data<T>();
    output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
    output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
    T *output_box_data = output_box->data<T>();
    T *output_assign_box_data = output_assign_box->data<T>();

    int block = 512;
    int grid = (roi_num * class_num + block - 1) / block;
    auto &device_ctx = context.cuda_device_context();

    const T box_clip = context.Attr<T>("box_clip");

    DecodeBoxKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
        prior_box_data, prior_box_var_data, target_box_data, roi_num,
        class_num, box_clip, output_box_data);

    context.device_context().Wait();
    int assign_grid = (roi_num + block - 1) / block;
    AssignBoxKernel<T><<<assign_grid, block, 0, device_ctx.stream()>>>(
        prior_box_data, box_score_data, output_box_data, roi_num, class_num,
        output_assign_box_data);
    context.device_context().Wait();
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    box_decoder_and_assign,
    ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
                                       float>,
    ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
                                       double>);
paddle/fluid/operators/detection/box_decoder_and_assign_op.h
0 → 100644
View file @ 2c4fcaa6
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class BoxDecoderAndAssignKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto *prior_box = context.Input<framework::LoDTensor>("PriorBox");
    auto *prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
    auto *target_box = context.Input<framework::LoDTensor>("TargetBox");
    auto *box_score = context.Input<framework::LoDTensor>("BoxScore");
    auto *output_box = context.Output<framework::Tensor>("DecodeBox");
    auto *output_assign_box =
        context.Output<framework::Tensor>("OutputAssignBox");
    int roi_num = target_box->dims()[0];
    int class_num = box_score->dims()[1];
    auto *target_box_data = target_box->data<T>();
    auto *prior_box_data = prior_box->data<T>();
    auto *prior_box_var_data = prior_box_var->data<T>();
    auto *box_score_data = box_score->data<T>();
    output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
    output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
    T *output_box_data = output_box->data<T>();
    T *output_assign_box_data = output_assign_box->data<T>();
    const T bbox_clip = context.Attr<T>("box_clip");

    for (int i = 0; i < roi_num; ++i) {
      T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
      T prior_box_height =
          prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
      T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
      T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;
      for (int j = 0; j < class_num; ++j) {
        int64_t offset = i * class_num * 4 + j * 4;
        T dw = std::min(prior_box_var_data[2] * target_box_data[offset + 2],
                        bbox_clip);
        T dh = std::min(prior_box_var_data[3] * target_box_data[offset + 3],
                        bbox_clip);
        T target_box_center_x = 0, target_box_center_y = 0;
        T target_box_width = 0, target_box_height = 0;
        target_box_center_x =
            prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
            prior_box_center_x;
        target_box_center_y = prior_box_var_data[1] *
                                  target_box_data[offset + 1] *
                                  prior_box_height +
                              prior_box_center_y;
        target_box_width = std::exp(dw) * prior_box_width;
        target_box_height = std::exp(dh) * prior_box_height;

        output_box_data[offset] = target_box_center_x - target_box_width / 2;
        output_box_data[offset + 1] =
            target_box_center_y - target_box_height / 2;
        output_box_data[offset + 2] =
            target_box_center_x + target_box_width / 2 - 1;
        output_box_data[offset + 3] =
            target_box_center_y + target_box_height / 2 - 1;
      }

      T max_score = -1;
      int max_j = -1;
      for (int j = 0; j < class_num; ++j) {
        T score = box_score_data[i * class_num + j];
        if (score > max_score && j > 0) {
          max_score = score;
          max_j = j;
        }
      }

      if (max_j > 0) {
        for (int pno = 0; pno < 4; pno++) {
          output_assign_box_data[i * 4 + pno] =
              output_box_data[i * class_num * 4 + max_j * 4 + pno];
        }
      } else {
        for (int pno = 0; pno < 4; pno++) {
          output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
        }
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
0 → 100644
View file @ 2c4fcaa6
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"

namespace paddle {
namespace operators {

class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("FpnRois"),
                   "Input(FpnRois) shouldn't be null");
    PADDLE_ENFORCE_GE(
        ctx->Outputs("MultiFpnRois").size(), 1UL,
        "Outputs(MultiFpnRois) of DistributeOp should not be empty");
    size_t min_level = static_cast<size_t>(ctx->Attrs().Get<int>("min_level"));
    size_t max_level = static_cast<size_t>(ctx->Attrs().Get<int>("max_level"));
    PADDLE_ENFORCE_GE(max_level, min_level,
                      "max_level must not lower than min_level");
    // Set the output shape
    size_t num_out_rois = max_level - min_level + 1;
    std::vector<framework::DDim> outs_dims;
    outs_dims.reserve(num_out_rois);
    for (size_t i = 0; i < num_out_rois; ++i) {
      framework::DDim out_dim = {-1, 4};
      outs_dims.push_back(out_dim);
    }
    ctx->SetOutputsDim("MultiFpnRois", outs_dims);
    ctx->SetOutputDim("RestoreIndex", {1, -1});
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois"));
    return framework::OpKernelType(data_type, platform::CPUPlace());
  }
};

class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)");
    AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator")
        .AsDuplicable();
    AddOutput("RestoreIndex",
              "(Tensor) An array of positive number which is "
              "used to restore the order of FpnRois");
    AddAttr<int>("min_level",
                 "The lowest level of FPN layer where the"
                 " proposals come from");
    AddAttr<int>("max_level",
                 "The highest level of FPN layer where the"
                 " proposals come from");
    AddAttr<int>("refer_level",
                 "The referring level of FPN layer with"
                 " specified scale");
    AddAttr<int>("refer_scale",
                 "The referring scale of FPN layer with"
                 " specified level");
    AddComment(R"DOC(
This operator distribute all proposals into different fpn level,
with respect to scale of the proposals, the referring scale and
the referring level. Besides, to restore the order of proposals,
we return an array which indicate the original index of rois in
current proposals.
)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp,
                  ops::DistributeFpnProposalsOpMaker,
                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals,
                       ops::DistributeFpnProposalsOpKernel<float>,
                       ops::DistributeFpnProposalsOpKernel<double>);
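
As a hedged illustration of the distribution rule used by the CPU and GPU kernels below (not part of the patch; attribute values are chosen only for the example, roughly following the FPN convention of referring level 4 at scale 224), the target level for a single RoI is derived from its scale like this:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  // One RoI in [xmin, ymin, xmax, ymax] form plus the operator's attributes.
  const double roi[4] = {0.0, 0.0, 111.0, 111.0};
  const int min_level = 2, max_level = 5;
  const int refer_level = 4, refer_scale = 224;

  // BBoxArea with the non-normalized (+1) convention used in the kernel.
  double w = roi[2] - roi[0] + 1;
  double h = roi[3] - roi[1] + 1;
  double roi_scale = std::sqrt(w * h);

  // tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level), clamped
  // to [min_level, max_level].
  int tgt_lvl = static_cast<int>(
      std::floor(std::log2(roi_scale / refer_scale) + refer_level));
  tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));

  printf("roi_scale = %.1f -> FPN level %d\n", roi_scale, tgt_lvl);  // level 3
  return 0;
}
```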
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
0 → 100644
View file @ 2c4fcaa6
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <paddle/fluid/memory/allocation/allocator.h>
#include "cub/cub.cuh"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/for_range.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;

#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

int const BBoxSize = 4;

struct RangeInitFunctor {
  int start_;
  int delta_;
  int* out_;
  __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
};

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}

static inline void TransLoD(const int* length_lod, const int lod_size,
                            int* offset_lod) {
  int offset = 0;
  for (int i = 0; i < lod_size; ++i) {
    offset_lod[i] = offset;
    offset += length_lod[i];
  }
}

template <typename T>
static __device__ inline T RoIArea(const T* box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are is invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
    return static_cast<T>(0.);
  } else {
    const T w = box[2] - box[0];
    const T h = box[3] - box[1];
    if (normalized) {
      return w * h;
    } else {
      // If coordinate values are not within range [0, 1].
      return (w + 1) * (h + 1);
    }
  }
}

template <class T>
static __global__ void GPUDistFpnProposalsHelper(
    const int nthreads, const T* rois, const int lod_size,
    const int refer_level, const int refer_scale, const int max_level,
    const int min_level, int* roi_batch_id_data, int* sub_lod_list,
    int* target_lvls) {
  CUDA_1D_KERNEL_LOOP(i, nthreads) {
    const T* offset_roi = rois + i * BBoxSize;
    int roi_batch_ind = roi_batch_id_data[i];
    // get the target level of current rois
    T roi_area = RoIArea(offset_roi, false);
    T roi_scale = sqrt(roi_area);
    int tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level);
    tgt_lvl = min(max_level, max(tgt_lvl, min_level));
    target_lvls[i] = tgt_lvl;
    // compute number of rois in the same batch and same target level
    platform::CudaAtomicAdd(sub_lod_list + tgt_lvl * lod_size + roi_batch_ind,
                            1);
  }
}

template <typename DeviceContext, typename T>
class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* fpn_rois = ctx.Input<paddle::framework::LoDTensor>("FpnRois");

    auto multi_fpn_rois = ctx.MultiOutput<LoDTensor>("MultiFpnRois");

    auto* restore_index = ctx.Output<Tensor>("RestoreIndex");

    const int min_level = ctx.Attr<int>("min_level");
    const int max_level = ctx.Attr<int>("max_level");
    const int refer_level = ctx.Attr<int>("refer_level");
    const int refer_scale = ctx.Attr<int>("refer_scale");
    int num_level = max_level - min_level + 1;

    // check that the fpn_rois is not empty
    PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
                      "DistributeFpnProposalsOp need 1 level of LoD");

    auto fpn_rois_lod = fpn_rois->lod().back();
    int lod_size = fpn_rois_lod.size() - 1;
    int roi_num = fpn_rois_lod[lod_size];

    auto& dev_ctx = ctx.template device_context<DeviceContext>();

    // get batch id by lod in CPU
    Tensor roi_batch_id_list;
    roi_batch_id_list.Resize({roi_num});
    int* roi_batch_id_data =
        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
    for (int n = 0; n < lod_size; ++n) {
      for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) {
        roi_batch_id_data[i] = n;
      }
    }
    // copy batch id list to GPU
    Tensor roi_batch_id_list_gpu;
    framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(),
                              &roi_batch_id_list_gpu);

    Tensor sub_lod_list;
    sub_lod_list.Resize({num_level, lod_size});
    int* sub_lod_list_data = sub_lod_list.mutable_data<int>(dev_ctx.GetPlace());
    Tensor target_lvls;
    target_lvls.Resize({roi_num});
    int* target_lvls_data = target_lvls.mutable_data<int>(dev_ctx.GetPlace());

    int blocks = NumBlocks(roi_num);
    int threads = kNumCUDAThreads;

    // get target levels and sub_lod list
    GPUDistFpnProposalsHelper<T><<<blocks, threads>>>(
        roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
        max_level, min_level, roi_batch_id_list_gpu.data<int>(),
        sub_lod_list_data, target_lvls_data);

    Tensor index_in_t;
    int* idx_in = index_in_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
    platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, roi_num);
    for_range(RangeInitFunctor{0, 1, idx_in});

    Tensor keys_out_t;
    int* keys_out = keys_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
    Tensor index_out_t;
    int* idx_out = index_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());

    // Determine temporary device storage requirements
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairsDescending<int, int>(
        nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in,
        idx_out, roi_num);
    // Allocate temporary storage
    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
    auto d_temp_storage = memory::Alloc(place, temp_storage_bytes,
                                        memory::Allocator::kScratchpad);

    // Run sorting operation
    // sort target level to get corresponding index
    cub::DeviceRadixSort::SortPairsDescending<int, int>(
        d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
        idx_in, idx_out, roi_num);

    int* restore_idx_data =
        restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
    // sort current index to get restore index
    cub::DeviceRadixSort::SortPairsDescending<int, int>(
        d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
        restore_idx_data, roi_num);

    Tensor offset_lod;
    int* offset_lod_data =
        offset_lod.mutable_data<int>({lod_size + 1}, dev_ctx.GetPlace());
    for (int i = 0; i < num_level; ++i) {
      Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
      int* sub_lod_data = sub_lod.data<int>();
      // transfer length-based lod to offset-based lod
      TransLoD(sub_lod_data, lod_size + 1, offset_lod_data);
      int sub_rois_num = offset_lod_data[lod_size];
      Tensor sub_idx = index_out_t.Slice(0, sub_rois_num);

      multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
                                         dev_ctx.GetPlace());

      GPUGather<T>(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]);
      framework::LoD lod;
      std::vector<size_t> offset;
      memory::Copy(platform::CPUPlace(), offset.data(), place, offset_lod_data,
                   sizeof(int) * (lod_size + 1), 0);
      lod.emplace_back(offset);
      multi_fpn_rois[i]->set_lod(lod);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    distribute_fpn_proposals,
    ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
                                           float>,
    ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
                                           double>);
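
The TransLoD helper above converts a length-based LoD (how many RoIs each batch item contributes at one level) into the offset-based form Paddle's LoDTensor stores. A small standalone sketch of the same conversion (not part of the patch, values made up for illustration):

```cpp
#include <cstdio>

// Same logic as TransLoD in distribute_fpn_proposals_op.cu: prefix sums of
// per-batch lengths become offsets.
static void TransLoD(const int* length_lod, const int lod_size,
                     int* offset_lod) {
  int offset = 0;
  for (int i = 0; i < lod_size; ++i) {
    offset_lod[i] = offset;
    offset += length_lod[i];
  }
}

int main() {
  // Two batch items contributing 3 and 2 RoIs at one FPN level; the extra
  // slot mirrors the lod_size + 1 usage in the kernel.
  int lengths[3] = {3, 2, 0};
  int offsets[3] = {0, 0, 0};
  TransLoD(lengths, 3, offsets);
  printf("%d %d %d\n", offsets[0], offsets[1], offsets[2]);  // 0 3 5
  return 0;
}
```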
paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
0 → 100644
View file @ 2c4fcaa6
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <algorithm>
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"

namespace paddle {
namespace operators {

const int kBoxDim = 4;

template <typename T>
static inline T BBoxArea(const T* box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are is invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
    return static_cast<T>(0.);
  } else {
    const T w = box[2] - box[0];
    const T h = box[3] - box[1];
    if (normalized) {
      return w * h;
    } else {
      // If coordinate values are not within range [0, 1].
      return (w + 1) * (h + 1);
    }
  }
}

template <typename T>
class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* fpn_rois = context.Input<paddle::framework::LoDTensor>("FpnRois");

    auto multi_fpn_rois =
        context.MultiOutput<paddle::framework::LoDTensor>("MultiFpnRois");

    auto* restore_index =
        context.Output<paddle::framework::Tensor>("RestoreIndex");

    const int min_level = context.Attr<int>("min_level");
    const int max_level = context.Attr<int>("max_level");
    const int refer_level = context.Attr<int>("refer_level");
    const int refer_scale = context.Attr<int>("refer_scale");
    const int num_level = max_level - min_level + 1;

    // check that the fpn_rois is not empty
    PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
                      "DistributeFpnProposalsOp need 1 level of LoD");

    auto fpn_rois_lod = fpn_rois->lod().back();
    int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
    std::vector<int> target_level;
    // std::vector<int> target_level(fpn_rois_num, -1);
    // record the number of rois in each level
    std::vector<int> num_rois_level(num_level, 0);
    std::vector<int> num_rois_level_integral(num_level + 1, 0);
    for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
      Tensor fpn_rois_slice =
          fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
      const T* rois_data = fpn_rois_slice.data<T>();
      for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
        // get the target level of current rois
        T roi_scale = std::sqrt(BBoxArea(rois_data, false));
        int tgt_lvl =
            std::floor(std::log2(roi_scale / refer_scale) + refer_level);
        tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
        target_level.push_back(tgt_lvl);
        num_rois_level[tgt_lvl - min_level]++;
        rois_data += kBoxDim;
      }
    }
    // define the output rois
    // pointer which point to each level fpn rois
    std::vector<T*> multi_fpn_rois_data(num_level);
    // lod0 which will record the offset information of each level rois
    std::vector<std::vector<size_t>> multi_fpn_rois_lod0;
    for (int i = 0; i < num_level; ++i) {
      // allocate memory for each level rois
      multi_fpn_rois[i]->mutable_data<T>({num_rois_level[i], kBoxDim},
                                         context.GetPlace());
      multi_fpn_rois_data[i] = multi_fpn_rois[i]->data<T>();
      std::vector<size_t> lod0(1, 0);
      multi_fpn_rois_lod0.push_back(lod0);
      // statistic start point for each level rois
      num_rois_level_integral[i + 1] =
          num_rois_level_integral[i] + num_rois_level[i];
    }
    restore_index->mutable_data<int>({1, fpn_rois_num}, context.GetPlace());
    int* restore_index_data = restore_index->data<int>();
    std::vector<int> restore_index_inter(fpn_rois_num, -1);
    // distribute the rois into different fpn level by target level
    for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
      Tensor fpn_rois_slice =
          fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
      const T* rois_data = fpn_rois_slice.data<T>();
      size_t cur_offset = fpn_rois_lod[i];
      // std::vector<size_t > lod_offset[num_level];
      for (int j = 0; j < num_level; j++) {
        multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]);
      }
      for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
        int lvl = target_level[cur_offset + j];
        memcpy(multi_fpn_rois_data[lvl - min_level], rois_data,
               kBoxDim * sizeof(T));
        multi_fpn_rois_data[lvl - min_level] += kBoxDim;
        int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
                               multi_fpn_rois_lod0[lvl - min_level][i + 1];
        restore_index_inter[index_in_shuffle] = cur_offset + j;
        multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
        rois_data += kBoxDim;
      }
    }
    for (int i = 0; i < fpn_rois_num; ++i) {
      restore_index_data[restore_index_inter[i]] = i;
    }
    // merge lod information into LoDTensor
    for (int i = 0; i < num_level; ++i) {
      framework::LoD lod;
      lod.emplace_back(multi_fpn_rois_lod0[i]);
      multi_fpn_rois[i]->set_lod(lod);
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
View file @ 2c4fcaa6
...
@@ -22,7 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"

 namespace paddle {
 namespace operators {
...
@@ -47,7 +46,7 @@ struct EmbeddingVSumFunctor {
     auto *output = output_t->mutable_data<T>(context.GetPlace());

     PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
-    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL);
+    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL, "The LoD[0] could NOT be empty");

     jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
                                   out_width, jit::SeqPoolType::kSum);
...
@@ -83,11 +82,11 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
         FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims());
     const auto &ids_lod = ids_t->lod();
     // in run time, the LoD of ids must be 1
-    PADDLE_ENFORCE(ids_lod.size(), 1u, "The LoD level of Input(Ids) must be 1");
+    PADDLE_ENFORCE(ids_lod.size(), 1UL,
+                   "The LoD level of Input(Ids) must be 1");
+    PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
     int64_t batch_size = ids_lod[0].size() - 1;
     // in run time, the shape from Ids -> output
-    // should be [seq_length, 1] -> [batch_size, embedding_size]
+    // should be [seq_length, 1] -> [batch_size, last_dim]
     output_t->Resize({batch_size, last_dim});

     if (combiner_type == "sum") {
...
@@ -125,7 +124,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
       auto *ids_data = ids->data<int64_t>();
       int64_t ids_num = ids->numel();
       auto lod = ids->lod()[0];
-      int64_t row_width = d_output->dims()[1];
+      int64_t out_width = d_output->dims()[1];

       framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
       new_rows->resize(ids_num);
...
@@ -136,15 +135,13 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
       T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
       const T *d_output_data = d_output->data<T>();

-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+      auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>,
+                                 platform::CPUPlace>(out_width);
       for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
         int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
-        int64_t in_offset = lod[i] * row_width;
-        const T *out_pos = d_output_data + i * row_width;
-        T *in_pos = d_table_data + in_offset;
-        for (int r = 0; r != h; ++r) {
-          blas.VCOPY(row_width, out_pos, in_pos + r * row_width);
-        }
+        const T *src = d_output_data + i * out_width;
+        T *dst = d_table_data + lod[i] * out_width;
+        vbroadcast(src, dst, h, out_width);
       }
     } else {
       LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
...
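
The refactor above replaces the per-row blas.VCOPY loop with a single kVBroadcast jit kernel call. A minimal standalone sketch of the semantics that call is expected to have (reference behaviour only, assuming the same (src, dst, h, w) argument order as the jit kernel):

```cpp
#include <cstdio>
#include <cstring>

// Reference semantics of the kVBroadcast kernel: copy one row of width w
// into h consecutive rows of dst (same loop as VBroadcast in mkl.h).
template <typename T>
void VBroadcastRef(const T* src, T* dst, int64_t h, int64_t w) {
  for (int64_t r = 0; r < h; ++r) {
    std::memcpy(dst + r * w, src, w * sizeof(T));
  }
}

int main() {
  const float grad_row[3] = {0.1f, 0.2f, 0.3f};  // d_output row for one sequence
  float table_grad[6] = {0};                     // gradient rows for 2 ids
  VBroadcastRef(grad_row, table_grad, /*h=*/2, /*w=*/3);
  for (float v : table_grad) printf("%.1f ", v);  // 0.1 0.2 0.3 0.1 0.2 0.3
  printf("\n");
  return 0;
}
```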
paddle/fluid/operators/jit/benchmark.cc
View file @ 2c4fcaa6
...
@@ -474,6 +474,23 @@ void BenchCRFDecodingKernel() {
   }
 }

+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchVBroadcastKernel() {
+  for (int64_t w : {1, 16, 64, 100, 256}) {
+    Tensor x;
+    x.Resize({w});
+    RandomVec<T>(w, x.mutable_data<T>(PlaceType()));
+    const T* x_data = x.data<T>();
+    for (int h : TestSizes()) {
+      Tensor y;
+      y.Resize({h * w});
+      T* y_data = y.mutable_data<T>(PlaceType());
+      BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
+          w, x_data, y_data, static_cast<int64_t>(h), w);
+    }
+  }
+}
+
 using T = float;
 using CPUPlace = paddle::platform::CPUPlace;
...
@@ -498,6 +515,7 @@ BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
 BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
 BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
 BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVCopy) { BenchXYNKernel<jit::kVCopy, T, CPUPlace>(); }

 // lstm and peephole
 BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
...
@@ -535,6 +553,11 @@ BENCH_FP32_CPU(kCRFDecoding) {
   BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
 }

+// vbroadcast function
+BENCH_FP32_CPU(kVBroadcast) {
+  BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>();
+}
+
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
 // Options:
...
paddle/fluid/operators/jit/gen/CMakeLists.txt
View file @ 2c4fcaa6
...
@@ -33,3 +33,4 @@ USE_JITKERNEL_GEN(kHMax)
 USE_JITKERNEL_GEN(kHSum)
 USE_JITKERNEL_GEN(kEmbSeqPool)
 USE_JITKERNEL_GEN(kSgd)
+USE_JITKERNEL_GEN(kVBroadcast)
paddle/fluid/operators/jit/gen/vbroadcast.cc
0 → 100644
View file @ 2c4fcaa6
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
#include <memory>
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"

namespace paddle {
namespace operators {
namespace jit {
namespace gen {

void VBroadcastJitCode::genCode() {
  preCode();
  constexpr int block = YMM_FLOAT_BLOCK;
  constexpr int max_num_regs = 16;
  const int num_block = w_ / block;
  const int num_groups = num_block / max_num_regs;
  const size_t block_size = sizeof(float) * block;
  std::vector<int> groups(num_groups, max_num_regs);
  int rest_num_regs = num_block % max_num_regs;
  if (rest_num_regs > 0) {
    groups.push_back(rest_num_regs);
  }

  // protect param_h
  mov(reg_height, param_h);
  Label l_next_h;
  xor_(reg_h_i, reg_h_i);
  mov(reg_ptr_dst_i, param_dst);
  L(l_next_h);
  {
    mov(reg_ptr_src_i, param_src);
    for (int num_regs : groups) {
      size_t w_offset = 0;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]);
        w_offset += block_size;
      }
      add(reg_ptr_src_i, num_regs * block_size);

      w_offset = 0;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i));
        w_offset += block_size;
      }
      add(reg_ptr_dst_i, num_regs * block_size);
    }  // end of groups
    inc(reg_h_i);
    cmp(reg_h_i, reg_height);
    jl(l_next_h, T_NEAR);
  }  // end of l_next_h

  postCode();
}

class VBroadcastCreator : public JitCodeCreator<int64_t> {
 public:
  bool UseMe(const int64_t& w) const override {
    return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
  }
  size_t CodeSize(const int64_t& w) const override {
    return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
  }
  std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
    PADDLE_ENFORCE_GT(w, 0);
    return make_unique<VBroadcastJitCode>(w, CodeSize(w));
  }
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle

namespace gen = paddle::operators::jit::gen;

REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator);
paddle/fluid/operators/jit/gen/vbroadcast.h
0 → 100644
View file @ 2c4fcaa6
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#pragma once

#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"

namespace paddle {
namespace operators {
namespace jit {
namespace gen {

class VBroadcastJitCode : public JitCode {
 public:
  explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024,
                             void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), w_(w) {
    this->genCode();
  }

  DECLARE_JIT_CODE(VBroadcastJitCode);
  void genCode() override;

 private:
  int w_;
  reg64_t param_src{abi_param1};
  reg64_t param_dst{abi_param2};
  reg64_t param_h{abi_param3};
  reg64_t param_w{abi_param4};

  reg64_t reg_height{r9};
  reg64_t reg_h_i{r10};
  reg64_t reg_ptr_src_i{r11};
  reg64_t reg_ptr_dst_i{r12};
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle
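
For context on how this generated kernel is reached from operator code, the dispatch pattern visible in fused_embedding_seq_pool_op.h looks roughly like the following sketch (illustrative only; it assumes float data and that an implementation is registered for the given width, and the wrapper function name is hypothetical):

```cpp
#include "paddle/fluid/operators/jit/kernels.h"

// Hypothetical call site: fetch the best kVBroadcast implementation for row
// width w (the jitcode above if AVX is available and w % 8 == 0, otherwise
// the MKL or reference kernel), then broadcast one row into h rows of dst.
void BroadcastRow(const float* src, float* dst, int64_t h, int64_t w) {
  auto vbroadcast = paddle::operators::jit::Get<
      paddle::operators::jit::kVBroadcast,
      paddle::operators::jit::VBroadcastTuples<float>,
      paddle::platform::CPUPlace>(w);
  vbroadcast(src, dst, h, w);  // dst must hold h * w floats
}
```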
paddle/fluid/operators/jit/helper.cc
View file @ 2c4fcaa6
...
@@ -36,6 +36,8 @@ const char* to_string(KernelType kt) {
     ONE_CASE(kVScal);
     ONE_CASE(kVAddBias);
     ONE_CASE(kVRelu);
+    ONE_CASE(kVBroadcast);
+    ONE_CASE(kVCopy);
     ONE_CASE(kVIdentity);
     ONE_CASE(kVExp);
     ONE_CASE(kVSquare);
...
paddle/fluid/operators/jit/kernel_base.h
View file @ 2c4fcaa6
...
@@ -41,6 +41,8 @@ typedef enum {
   kVAdd,
   kVAddBias,
   kVAddRelu,
+  kVBroadcast,
+  kVCopy,
   kVExp,
   kVIdentity,
   kVMul,
...
@@ -133,6 +135,13 @@ struct GRUTuples {
   typedef void (*func_type)(gru_t*, const gru_attr_t*);
 };

+template <typename T>
+struct VBroadcastTuples {
+  typedef T data_type;
+  typedef int64_t attr_type;
+  typedef void (*func_type)(const T*, T*, int64_t, int64_t);
+};
+
 typedef struct seq_pool_attr_s {
   int h, w;  // h should always be the first one
   SeqPoolType type;
...
paddle/fluid/operators/jit/kernel_key.cc
View file @ 2c4fcaa6
...
@@ -24,6 +24,11 @@ size_t JitCodeKey<int>(const int& d) {
   return d;
 }

+template <>
+size_t JitCodeKey<int64_t>(const int64_t& d) {
+  return d;
+}
+
 // TODO(TJ): refine and benchmark JitCodeKey generatation
 constexpr int act_type_shift = 3;  // suppot 2^3 act types
 static inline int act_type_convert(KernelType type) {
...
paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
View file @ 2c4fcaa6
...
@@ -9,9 +9,11 @@ USE_JITKERNEL_MORE(kVAdd, mkl)
 USE_JITKERNEL_MORE(kVScal, mkl)
 USE_JITKERNEL_MORE(kVExp, mkl)
 USE_JITKERNEL_MORE(kVSquare, mkl)
+USE_JITKERNEL_MORE(kVCopy, mkl)
 USE_JITKERNEL_MORE(kVSigmoid, mkl)
 USE_JITKERNEL_MORE(kVTanh, mkl)
 USE_JITKERNEL_MORE(kSeqPool, mkl)
 USE_JITKERNEL_MORE(kSoftmax, mkl)
 USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
 USE_JITKERNEL_MORE(kSgd, mkl)
+USE_JITKERNEL_MORE(kVBroadcast, mkl)
paddle/fluid/operators/jit/more/mkl/mkl.cc
...
@@ -154,6 +154,21 @@ bool VSquareKernel<float>::UseMe(const int& d) const {
   return d > 7;
 }
 
+template <>
+bool VCopyKernel<float>::UseMe(const int& d) const {
+  return d > 15;
+}
+
+template <>
+bool VBroadcastKernel<float>::UseMe(const int64_t& d) const {
+  return d > 127;
+}
+
+template <>
+bool VBroadcastKernel<double>::UseMe(const int64_t& attr) const {
+  return true;
+}
+
 template <>
 bool VSigmoidKernel<float>::UseMe(const int& d) const {
   return d > 7;
...
@@ -223,6 +238,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp);
 AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
 AWALYS_USE_ME_WITH_DOUBLE(VTanh);
 AWALYS_USE_ME_WITH_DOUBLE(VSquare);
+AWALYS_USE_ME_WITH_DOUBLE(VCopy);
 AWALYS_USE_ME_WITH_DOUBLE(Softmax);
 #undef AWALYS_USE_ME_WITH_DOUBLE
...
@@ -244,6 +260,8 @@ REGISTER_MKL_KERNEL(kVAdd, VAdd);
 REGISTER_MKL_KERNEL(kVScal, VScal);
 REGISTER_MKL_KERNEL(kVExp, VExp);
 REGISTER_MKL_KERNEL(kVSquare, VSquare);
+REGISTER_MKL_KERNEL(kVCopy, VCopy);
+REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast);
 REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MKL_KERNEL(kVTanh, VTanh);
 REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
...
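The UseMe predicates above are size gates: the MKL-backed VCopy is only reported as worthwhile for d > 15, the float VBroadcast for row widths > 127, and the double VBroadcast always. A rough standalone sketch of that kind of threshold dispatch (illustrative names, not the actual kernel pool):

#include <cstring>

using copy_fn = void (*)(const float*, float*, int);

static void copy_ref(const float* x, float* y, int n) {
  for (int i = 0; i < n; ++i) y[i] = x[i];  // portable fallback
}
static void copy_vendor(const float* x, float* y, int n) {
  std::memcpy(y, x, n * sizeof(float));  // stand-in for the vendor-optimized path
}

// Mirrors the spirit of VCopyKernel<float>::UseMe: the optimized path only
// pays off above a small size threshold.
static copy_fn pick_copy(int n) { return (n > 15) ? copy_vendor : copy_ref; }

int main() {
  float a[32], b[32];
  for (int i = 0; i < 32; ++i) a[i] = static_cast<float>(i);
  pick_copy(32)(a, b, 32);  // large enough: "vendor" branch
  pick_copy(8)(a, b, 8);    // small size: reference branch
  return 0;
}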
paddle/fluid/operators/jit/more/mkl/mkl.h
...
@@ -50,6 +50,13 @@ void VCopy(const T* x, T* y, int n);
 template <typename T>
 void VAXPY(T a, const T* x, T* y, int n);
 
+template <typename T>
+void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
+  for (int64_t h = 0; h < y_h; ++h) {
+    VCopy(x, y + h * x_len, x_len);
+  }
+}
+
 template <typename T>
 void VSigmoid(const T* x, T* y, int n) {
   const T min = SIGMOID_THRESHOLD_MIN;
...
@@ -192,6 +199,7 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples);
 DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
 DECLARE_MKL_KERNEL(VTanh, XYNTuples);
 DECLARE_MKL_KERNEL(VSquare, XYNTuples);
+DECLARE_MKL_KERNEL(VCopy, XYNTuples);
 DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
...
@@ -201,6 +209,8 @@ DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
 DECLARE_MKL_KERNEL(Sgd, SgdTuples);
+DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples);
 #undef DECLARE_MKL_KERNEL
 
 }  // namespace mkl
...
paddle/fluid/operators/jit/refer/CMakeLists.txt
...
@@ -13,6 +13,7 @@ USE_JITKERNEL_REFER(kVAddRelu)
 USE_JITKERNEL_REFER(kVSub)
 USE_JITKERNEL_REFER(kVScal)
 USE_JITKERNEL_REFER(kVAddBias)
+USE_JITKERNEL_REFER(kVCopy)
 USE_JITKERNEL_REFER(kVRelu)
 USE_JITKERNEL_REFER(kVIdentity)
 USE_JITKERNEL_REFER(kVExp)
...
@@ -34,3 +35,4 @@ USE_JITKERNEL_REFER(kHMax)
 USE_JITKERNEL_REFER(kSoftmax)
 USE_JITKERNEL_REFER(kEmbSeqPool)
 USE_JITKERNEL_REFER(kSgd)
+USE_JITKERNEL_REFER(kVBroadcast)
paddle/fluid/operators/jit/refer/refer.cc
...
@@ -30,6 +30,7 @@ REGISTER_REFER_KERNEL(kVScal, VScal);
 REGISTER_REFER_KERNEL(kVAddBias, VAddBias);
 REGISTER_REFER_KERNEL(kVRelu, VRelu);
+REGISTER_REFER_KERNEL(kVCopy, VCopy);
 REGISTER_REFER_KERNEL(kVIdentity, VIdentity);
 REGISTER_REFER_KERNEL(kVSquare, VSquare);
 REGISTER_REFER_KERNEL(kVExp, VExp);
...
@@ -61,4 +62,6 @@ REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool);
 REGISTER_REFER_KERNEL(kSgd, Sgd);
+REGISTER_REFER_KERNEL(kVBroadcast, VBroadcast);
 #undef REGISTER_REFER_KERNEL
paddle/fluid/operators/jit/refer/refer.h
...
@@ -70,6 +70,20 @@ void VAddBias(const T* a, const T* x, T* y, int n) {
   }
 }
 
+template <typename T>
+void VCopy(const T* x, T* y, int n) {
+  std::memcpy(y, x, n * sizeof(T));
+}
+
+// x shape: (x_len)
+// y shape: (h, x_len)
+template <typename T>
+void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
+  for (int64_t h = 0; h < y_h; ++h) {
+    VCopy(x, y + h * x_len, x_len);
+  }
+}
+
 template <typename T>
 void VRelu(const T* x, T* y, int n) {
   for (int i = 0; i < n; ++i) {
...
@@ -500,6 +514,7 @@ DECLARE_REFER_KERNEL(VExp, XYNTuples);
 DECLARE_REFER_KERNEL(VSigmoid, XYNTuples);
 DECLARE_REFER_KERNEL(VTanh, XYNTuples);
 DECLARE_REFER_KERNEL(VSquare, XYNTuples);
+DECLARE_REFER_KERNEL(VCopy, XYNTuples);
 
 // lstm_t*, const lstm_attr_t*
 DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples);
...
@@ -528,6 +543,8 @@ DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
 DECLARE_REFER_KERNEL(Sgd, SgdTuples);
+DECLARE_REFER_KERNEL(VBroadcast, VBroadcastTuples);
 #undef DECLARE_REFER_KERNEL
 
 }  // namespace refer
...
paddle/fluid/operators/jit/test.cc
...
@@ -26,8 +26,8 @@ limitations under the License. */
 DEFINE_double(acc, 1e-5, "Test accuracy threshold.");
 
 template <typename T>
-void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
-               const T upper = static_cast<T>(20.f)) {
+void RandomVec(const int n, T* a, const T lower = static_cast<T>(-2.f),
+               const T upper = static_cast<T>(2.f)) {
   static unsigned int seed = 100;
   std::mt19937 rng(seed++);
   std::uniform_real_distribution<double> uniform_dist(0, 1);
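Note that the default sampling range tightens from [-20, 20] to [-2, 2], which is what lets the call sites further down drop their explicit "-2.f, 2.f" arguments. A sketch of the two call forms; the fill loop is a guess at the body elided by the diff:

#include <random>

template <typename T>
void RandomVec(const int n, T* a, const T lower = static_cast<T>(-2.f),
               const T upper = static_cast<T>(2.f)) {
  static unsigned int seed = 100;
  std::mt19937 rng(seed++);
  std::uniform_real_distribution<double> uniform_dist(0, 1);
  for (int i = 0; i < n; ++i) {
    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);  // assumed fill
  }
}

int main() {
  float buf[8];
  RandomVec<float>(8, buf);             // new style: rely on the [-2, 2] default
  RandomVec<float>(8, buf, -1.f, 1.f);  // bounds can still be passed explicitly
  return 0;
}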
...
@@ -157,6 +157,26 @@ struct TestFuncWithRefer<jit::XRNTuples<T>, std::vector<T>, T> {
   }
 };
 
+template <typename T>
+struct TestFuncWithRefer<jit::VBroadcastTuples<T>, std::vector<T>,
+                         std::vector<T>, int64_t,
+                         typename jit::VBroadcastTuples<T>::attr_type> {
+  void operator()(const typename jit::VBroadcastTuples<T>::func_type tgt,
+                  const std::vector<T>& x, const std::vector<T>& yref,
+                  int64_t h,
+                  const typename jit::VBroadcastTuples<T>::attr_type& attr) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(x.size(), static_cast<size_t>(attr));
+    EXPECT_EQ(yref.size(), x.size() * h);
+    std::vector<T> y(yref.size());
+    const T* x_data = x.data();
+    const T* yref_data = yref.data();
+    T* y_data = y.data();
+    tgt(x_data, y_data, h, attr);
+    ExpectEQ<T>(y_data, yref_data, yref.size());
+  }
+};
+
 template <typename T>
 struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
   void operator()(const typename jit::XYNTuples<T>::func_type tgt,
...
@@ -514,7 +534,7 @@ void TestKernelXRNTuples() {
   auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
   EXPECT_TRUE(ref != nullptr);
   std::vector<T> x(d);
-  RandomVec<T>(d, x.data(), -2.f, 2.f);
+  RandomVec<T>(d, x.data());
   T ref_res;
   ref(x.data(), &ref_res, d);
   TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
...
@@ -532,7 +552,7 @@ void TestKernelXYNTuples() {
   std::vector<T> x(d), yref(d);
   std::vector<T> xinp(d);  // inplace test
-  RandomVec<T>(d, x.data(), -2.f, 2.f);
+  RandomVec<T>(d, x.data());
   std::copy(x.begin(), x.end(), xinp.begin());
   const T* x_data = x.data();
...
@@ -566,7 +586,7 @@ void TestKernelLSTMTuples() {
   EXPECT_TRUE(ref != nullptr);
   std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
   std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
-  RandomVec<T>(4 * d, xsrc.data(), -2.f, 2.f);
+  RandomVec<T>(4 * d, xsrc.data());
   RandomVec<T>(3 * d, wp.data(), -1.f, 1.f);
   RandomVec<T>(d, ct_1.data(), -1.f, 1.f);
   // x could be changed after compute, so copy to save src
...
@@ -614,8 +634,8 @@ void TestKernelGRUTuples() {
   auto ref = jit::GetRefer<KT, jit::GRUTuples<T>>();
   EXPECT_TRUE(ref != nullptr);
   std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
-  RandomVec<T>(3 * d, xsrc.data(), -2.f, 2.f);
-  RandomVec<T>(d, ht_1.data(), -2.f, 2.f);
+  RandomVec<T>(3 * d, xsrc.data());
+  RandomVec<T>(d, ht_1.data());
   // x could be changed after compute, so copy to save src
   std::vector<T> x(xsrc.size());
   std::copy(xsrc.begin(), xsrc.end(), x.begin());
...
@@ -651,7 +671,7 @@ void TestKernelSeqPoolTuples() {
   auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
   EXPECT_TRUE(ref != nullptr);
   std::vector<T> x(h * w), yref(w);
-  RandomVec<T>(h * w, x.data(), -2.f, 2.f);
+  RandomVec<T>(h * w, x.data());
   const T* x_data = x.data();
   T* yref_data = yref.data();
   ref(x_data, yref_data, &attr);
...
@@ -676,8 +696,8 @@ void TestKernelMatMulTuples() {
   auto ref = jit::GetRefer<KT, jit::MatMulTuples<T>>();
   EXPECT_TRUE(ref != nullptr);
   std::vector<T> a(m * k), b(k * n), c(m * n);
-  RandomVec<T>(m * k, a.data(), -2.f, 2.f);
-  RandomVec<T>(k * n, b.data(), -2.f, 2.f);
+  RandomVec<T>(m * k, a.data());
+  RandomVec<T>(k * n, b.data());
   const T* a_data = a.data();
   const T* b_data = b.data();
   T* c_data = c.data();
...
@@ -699,7 +719,7 @@ void TestKernelSoftmaxTuples() {
   auto ref = jit::GetRefer<KT, jit::SoftmaxTuples<T>>();
   EXPECT_TRUE(ref != nullptr);
   std::vector<T> x(bs * n), y(bs * n);
-  RandomVec<T>(bs * n, x.data(), -2.f, 2.f);
+  RandomVec<T>(bs * n, x.data());
   const T* x_data = x.data();
   T* y_data = y.data();
...
@@ -726,7 +746,7 @@ void TestKernelEmbSeqPoolTuples() {
   test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
   for (int tbl_w : test_sizes) {
     std::vector<T> table(tbl_h * tbl_w);
-    RandomVec<T>(tbl_h * tbl_w, table.data(), -2.f, 2.f);
+    RandomVec<T>(tbl_h * tbl_w, table.data());
     const T* table_data = table.data();
     for (auto type : pool_types) {
       for (int idx_w : {1, 2, 10, 16}) {
...
@@ -772,14 +792,14 @@ void TestKernelSgdTuples() {
     for (int grad_w : TestSizes()) {
       std::vector<T> param(param_h * grad_w);
       std::vector<T> param_out(param_h * grad_w);
-      RandomVec<T>(param_h * grad_w, param.data(), -2.f, 2.f);
+      RandomVec<T>(param_h * grad_w, param.data());
       const T* param_data = param.data();
       T* out_data = param_out.data();
       for (int rows_size = 1; rows_size <= param_h; ++rows_size) {
         std::vector<T> grad(rows_size * grad_w);
         std::vector<int64_t> rows =
             UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
-        RandomVec<T>(rows_size * grad_w, grad.data(), -2.f, 2.f);
+        RandomVec<T>(rows_size * grad_w, grad.data());
         const int64_t* rows_data = rows.data();
         const T* grad_data = grad.data();
         auto ref = jit::GetRefer<KT, jit::SgdTuples<T>>();
...
@@ -815,8 +835,8 @@ void TestKernelNCHW16CMulNCTuples() {
   int sz = n * c * h * w;
   std::vector<T> x(sz), y(n * c), zref(sz);
   std::vector<T> ztgt(sz), zjit(sz);
-  RandomVec<T>(sz, x.data(), -2.f, 2.f);
-  RandomVec<T>(n * c, y.data(), -2.f, 2.f);
+  RandomVec<T>(sz, x.data());
+  RandomVec<T>(n * c, y.data());
   const T* x_data = x.data();
   const T* y_data = y.data();
...
@@ -873,11 +893,11 @@ void TestKernelLayerNormTuples() {
       int sz = left * right;
       std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
           outref(sz);
-      RandomVec<T>(sz, x.data(), -2.f, 2.f);
-      RandomVec<T>(left, mean.data(), -2.f, 2.f);
-      RandomVec<T>(left, var.data(), -2.f, 2.f);
-      RandomVec<T>(right, scale.data(), -2.f, 2.f);
-      RandomVec<T>(right, bias.data(), -2.f, 2.f);
+      RandomVec<T>(sz, x.data());
+      RandomVec<T>(left, mean.data());
+      RandomVec<T>(left, var.data());
+      RandomVec<T>(right, scale.data());
+      RandomVec<T>(right, bias.data());
       const T* scale_data = scale.data();
       const T* bias_data = bias.data();
...
@@ -903,7 +923,7 @@ void TestKernelCRFDecodingTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   constexpr int state_trans_base_idx = 2;
   auto test_sizes = TestSizes();
-  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
   for (int seq_len : {1, 11, 17, 50}) {
     for (int tag_num : test_sizes) {
       auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
...
@@ -912,8 +932,8 @@ void TestKernelCRFDecodingTuples() {
       int w_sz = (tag_num + state_trans_base_idx) * tag_num;
       std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
       std::vector<int> trackref(x_sz);
-      RandomVec<T>(x_sz, x.data(), -2.f, 2.f);
-      RandomVec<T>(w_sz, w.data(), -2.f, 2.f);
+      RandomVec<T>(x_sz, x.data());
+      RandomVec<T>(w_sz, w.data());
       ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
           trackref.data(), tag_num);
...
@@ -926,6 +946,27 @@ void TestKernelCRFDecodingTuples() {
   }
 }
 
+template <jit::KernelType KT, typename T, typename PlaceType>
+void TestKernelVBroadcastTuples() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  for (int w : TestSizes()) {
+    std::vector<T> x(w);
+    RandomVec<T>(w, x.data());
+    const T* x_data = x.data();
+    for (int64_t h : {1, 2, 6}) {
+      auto ref = jit::GetRefer<KT, jit::VBroadcastTuples<T>>();
+      EXPECT_TRUE(ref != nullptr);
+      std::vector<T> y(w * h);
+      T* y_data = y.data();
+      ref(x_data, y_data, h, w);
+
+      TestAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType, std::vector<T>,
+                   std::vector<T>, int64_t>(static_cast<int64_t>(w), x, y, h,
+                                            static_cast<int64_t>(w));
+    }
+  }
+}
+
 #define TEST_CPU_KERNEL(test_tuple, kernel_type)                 \
   TEST(JITKernel, kernel_type) {                                 \
     TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
...
@@ -949,6 +990,7 @@ TEST_CPU_KERNEL(XYNTuples, kVSquare);
 TEST_CPU_KERNEL(XYNTuples, kVExp);
 TEST_CPU_KERNEL(XYNTuples, kVSigmoid);
 TEST_CPU_KERNEL(XYNTuples, kVTanh);
+TEST_CPU_KERNEL(XYNTuples, kVCopy);
 TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt);
 TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1);
...
@@ -966,6 +1008,7 @@ TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool);
 TEST_CPU_KERNEL(SgdTuples, kSgd);
 TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm);
 TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding);
+TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast);
 
 TEST(JITKernel_key, lstm) {
   jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
...
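The new kernel type is wired into the test harness through TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast). Expanded by hand under the assumption that the macro body is just the visible TestKernel##test_tuple call wrapped in a gtest TEST (the remaining lines of the macro are cut off in this view and may register more type/place combinations), it would read roughly:

// Hypothetical expansion of TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast);
// only the float/CPUPlace instantiation shown in the diff is reproduced here.
TEST(JITKernel, kVBroadcast) {
  TestKernelVBroadcastTuples<jit::kVBroadcast, float, CPUPlace>();
}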
paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
0 → 100644 (new file)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "mkldnn.hpp"
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/requantize_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"

namespace paddle {
namespace operators {

using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
using platform::to_void_cast;
using Tensor = framework::Tensor;
using framework::DataLayout;
using mkldnn::stream;
using platform::GetMKLDNNFormat;

template <typename T>
class ReQuantOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<Tensor>("Input");
    auto scale_in = ctx.Attr<float>("Scale_in");
    auto scale_out = ctx.Attr<float>("Scale_out");
    auto* output = ctx.Output<Tensor>("Output");
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& engine = dev_ctx.GetEngine();

    std::vector<primitive> pipeline;
    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
    mkldnn::memory::data_type src_dt =
        paddle::framework::ToMKLDNNDataType(input->type());
    mkldnn::memory::data_type dst_dt = src_dt;  // TODO(Xiaoli) support
                                                // requantize from different
                                                // data type (e.g., s8 to u8)
    mkldnn::memory::format src_fmt = memory::format::nhwc;
    mkldnn::memory::format dst_fmt = memory::format::nhwc;

    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    float scale_shift = scale_out / scale_in;

    mkldnn::primitive_attr attri;
    int mask = 0;
    attri.set_output_scales(mask, {scale_shift});

    auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt);
    auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
    auto src_memory =
        std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
    std::shared_ptr<primitive::at> src_memory_p =
        std::shared_ptr<primitive::at>(new primitive::at(*src_memory));

    auto dst_md = platform::MKLDNNMemDesc({dst_tz}, dst_dt, dst_fmt);
    auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
    auto dst_memory = mkldnn::memory(dst_pd, to_void_cast<T>(output_data));

    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
        new reorder::primitive_desc(src_pd, dst_pd, attri));
    auto reorder_p = std::shared_ptr<reorder>(
        new reorder(*reorder_pd, *src_memory_p, dst_memory));
    pipeline.push_back(*reorder_p);
    stream(stream::kind::eager).submit(pipeline).wait();

    output->set_layout(DataLayout::kMKLDNN);
    output->set_format(GetMKLDNNFormat(dst_memory));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_KERNEL(requantize, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ReQuantOpKernel<int8_t>,
                   ops::ReQuantOpKernel<uint8_t>);
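The whole kernel boils down to one MKL-DNN reorder whose output scale is scale_shift = scale_out / scale_in. A small worked example of that arithmetic (the scales and values below are made up; the exact rounding behaviour of the real reorder is a detail not shown here):

#include <cstdint>
#include <cstdio>

int main() {
  float scale_in = 64.0f;    // illustrative: an int8 value v represents v / 64
  float scale_out = 127.0f;  // target quantization scale
  float scale_shift = scale_out / scale_in;  // what attri.set_output_scales applies

  int8_t in = 32;  // represents 32 / 64 = 0.5
  int8_t out = static_cast<int8_t>(in * scale_shift + 0.5f);  // ~64, i.e. 64 / 127 ≈ 0.5
  std::printf("requantize %d -> %d (scale_shift = %.4f)\n", in, out, scale_shift);
  return 0;
}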
paddle/fluid/operators/recurrent_op.cc
...
@@ -157,11 +157,13 @@ class RecurrentBase : public framework::OperatorBase {
                          const std::vector<std::string>& src_vars,
                          framework::Scope* dst_scope,
                          const std::vector<std::string>& dst_vars,
-                         Callback callback) {
+                         Callback callback, bool is_backward = false) {
     PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
     for (size_t i = 0; i < dst_vars.size(); ++i) {
       VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
-      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
+                   is_backward);
     }
   }
...
@@ -173,11 +175,13 @@ class RecurrentBase : public framework::OperatorBase {
                          const std::vector<std::string>& src_vars,
                          const framework::Scope& dst_scope,
                          const std::vector<std::string>& dst_vars,
-                         Callback callback) {
+                         Callback callback, bool is_backward = false) {
     PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
     for (size_t i = 0; i < dst_vars.size(); ++i) {
       VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
-      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
+                   is_backward);
     }
   }
...
@@ -194,9 +198,13 @@ class RecurrentBase : public framework::OperatorBase {
   static void AccessTensor(const framework::Scope& src_scope,
                            const std::string& src_var_name,
                            framework::Scope* dst_scope,
-                           const std::string& dst_var_name, Callback callback) {
+                           const std::string& dst_var_name, Callback callback,
+                           bool is_backward = false) {
     auto* src_var = src_scope.FindVar(src_var_name);
-    PADDLE_ENFORCE(src_var != nullptr);
+    if (is_backward && src_var == nullptr) {
+      return;
+    }
+    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
     auto& src_tensor = src_var->Get<framework::LoDTensor>();
     auto* dst_var = dst_scope->Var(dst_var_name);
...
@@ -208,12 +216,16 @@ class RecurrentBase : public framework::OperatorBase {
   static void AccessTensor(const framework::Scope& src_scope,
                            const std::string& src_var_name,
                            const framework::Scope& dst_scope,
-                           const std::string& dst_var_name, Callback callback) {
+                           const std::string& dst_var_name, Callback callback,
+                           bool is_backward = false) {
+    auto* dst_var = dst_scope.FindVar(dst_var_name);
+    if (is_backward && dst_var == nullptr) {
+      return;
+    }
     auto* src_var = src_scope.FindVar(src_var_name);
-    PADDLE_ENFORCE(src_var != nullptr);
+    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
     auto& src_tensor = src_var->Get<framework::LoDTensor>();
-    auto* dst_var = dst_scope.FindVar(dst_var_name);
-    PADDLE_ENFORCE(dst_var != nullptr);
+    PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name);
     auto* dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
     callback(src_tensor, dst_tensor);
   }
...
@@ -345,7 +357,8 @@ class RecurrentGradOp : public RecurrentBase {
           auto dims = framework::vectorize(inside->dims());
           dims.erase(dims.begin());
           inside->Resize(framework::make_ddim(dims));
-        });
+        },
+        true /*is_backward*/);
     auto og_set = List2Set(Inputs(kOutputGrads));
     if (VLOG_IS_ON(10)) {
...
@@ -454,7 +467,8 @@ class RecurrentGradOp : public RecurrentBase {
             auto dst = outside->Slice(seq_offset, seq_offset + 1);
             framework::TensorCopy(inside, place, dev_ctx, &dst);
-          });
+          },
+          true /*is_backward*/);
       VLOG(5) << "Link outside gradient finished ";
 
       if (step_id + 1 == seq_len) {  // at_end
...
@@ -467,7 +481,8 @@ class RecurrentGradOp : public RecurrentBase {
               outside->Resize(inside.dims());
               outside->mutable_data(place, inside.type());
               framework::TensorCopy(inside, place, dev_ctx, outside);
-            });
+            },
+            true /*is_backward*/);
         VLOG(5) << "Link initialize state gradient finished ";
       }
       scopes.Next();
...
@@ -608,10 +623,8 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
     std::vector<std::string> input{kInputs, kInitialStates};
     std::vector<std::string> output{kOutputs};
     for (auto& s : input) {
+      // NOTE(zcd): In some case, some of kInputs doesn't have gradient.
       PADDLE_ENFORCE(ctx->HasInputs(s));
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
-                     "Cannot find the gradient variable %s",
-                     framework::GradVarName(s));
     }
     for (auto& s : output) {
       PADDLE_ENFORCE(ctx->HasInputs(s));
...
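The recurring change in this file is the optional is_backward flag: when linking gradient tensors, a missing variable is now tolerated and silently skipped instead of tripping PADDLE_ENFORCE. A minimal standalone sketch of that skip-if-absent pattern (illustrative types, not the Paddle Scope API):

#include <cassert>
#include <map>
#include <string>

// Illustrative stand-ins for a variable scope and a tensor.
using Scope = std::map<std::string, int>;

// Returns false (and does nothing) when the variable is absent and we are in
// the backward pass; asserts otherwise, mirroring the is_backward guard above.
bool LinkVar(const Scope& src, const std::string& name, int* dst,
             bool is_backward = false) {
  auto it = src.find(name);
  if (is_backward && it == src.end()) return false;  // some inputs simply have no gradient
  assert(it != src.end() && "variable not found");
  *dst = it->second;
  return true;
}

int main() {
  Scope scope{{"x@GRAD", 7}};
  int out = 0;
  LinkVar(scope, "x@GRAD", &out);                                      // found, copied
  bool linked = LinkVar(scope, "y@GRAD", &out, /*is_backward=*/true);  // silently skipped
  assert(out == 7 && !linked);
  return 0;
}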
paddle/fluid/operators/requantize_op.cc
0 → 100644 (new file)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/requantize_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif

namespace paddle {
namespace operators {

framework::OpKernelType ReQuantOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
  framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;

  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
                                 ctx.GetPlace(), layout_, library_);
}

void ReQuantOpMaker::Make() {
  AddInput("Input", "input data");
  AddOutput("Output", "output data");
  AddAttr<float>("Scale_in", "scale in data").SetDefault({1.0f});
  AddAttr<float>("Scale_out", "scale out data").SetDefault({1.0f});
  AddComment(
      R"DOC(This op will re-quantize data from INT8 with scale_in to INT8 with scale_out)DOC");
}

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(requantize, ops::ReQuantOp, ops::ReQuantOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
paddle/fluid/operators/requantize_op.h
0 → 100644 (new file)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

using framework::OpKernelType;
using framework::Tensor;

class ReQuantOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
    ctx->ShareLoD("Input", /*->*/ "Output");
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class ReQuantOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/reshape_op.cc
...
@@ -56,6 +56,9 @@ class ReshapeOp : public framework::OperatorWithKernel {
   static framework::DDim ValidateShape(const std::vector<int> shape,
                                        const framework::DDim& in_dims) {
     const int64_t in_size = framework::product(in_dims);
+    auto in_dims_vec = framework::vectorize(in_dims);
+    bool all_positive = std::all_of(in_dims_vec.cbegin(), in_dims_vec.cend(),
+                                    [](int64_t i) { return i > 0; });
     // only one dimension can be set to -1, whose size will be automatically
     // infered.
     const int64_t unk_dim_val = -1;
...
@@ -88,7 +91,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
     }
 
     if (unk_dim_idx != -1) {
-      if (in_size > 0) {
+      if (all_positive) {
         // in_size < 0 and is un-determinate in compile time, skip the check,
         // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
         // capacity = -24, in_size = -8, output_shape[0] = 0
...
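The guard now requires every input dimension to be known (all positive) before inferring the -1 entry; with an unknown batch dimension the division is meaningless, which is exactly the negative-capacity case the comment describes. A small worked example of the inference rule itself (plain arithmetic, not the operator code):

#include <cassert>
#include <cstdint>
#include <vector>

// Infer the single -1 entry of `shape` so the element count matches in_size.
// Only meaningful when in_size is fully known, i.e. every input dim is
// positive - the condition the new all_positive check enforces.
std::vector<int> InferUnknownDim(std::vector<int> shape, int64_t in_size) {
  int64_t known = 1;
  int unk_idx = -1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      unk_idx = static_cast<int>(i);
    } else {
      known *= shape[i];
    }
  }
  if (unk_idx >= 0) shape[unk_idx] = static_cast<int>(in_size / known);
  return shape;
}

int main() {
  // in_dims = [4, 8, 1, 1] -> in_size = 32, so shape = [-1, 8] becomes [4, 8].
  auto out = InferUnknownDim({-1, 8}, 32);
  assert(out[0] == 4 && out[1] == 8);
  // With in_dims = [-1, 8, 1, 1] the product is unknown (negative), so the
  // operator now skips this division instead of producing output_shape[0] = 0.
  return 0;
}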
...
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
...
@@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
     AddAttr<std::string>("calibration_data", "the calibration data for int8");
+    AddAttr<std::string>(
+        "engine_serialized_data",
+        "the serialized data contains the all info of the ICUDAEngine");
     AddAttr<std::string>(
         "engine_key",
         "The engine_key here is used to distinguish different TRT Engines");
...
...
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
...
@@ -16,8 +16,10 @@
 #ifdef PADDLE_WITH_CUDA
 
+#include <memory>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/framework/executor.h"
...
@@ -31,37 +33,6 @@ namespace paddle {
 namespace operators {
 
-using FluidDT = framework::proto::VarType_Type;
-using TRT_DT = nvinfer1::DataType;
-
-namespace {  // NOLINT
-
-TRT_DT FluidDataType2TRT(FluidDT type) {
-  switch (type) {
-    case FluidDT::VarType_Type_FP32:
-      return TRT_DT::kFLOAT;
-    case FluidDT::VarType_Type_INT32:
-      return TRT_DT::kINT32;
-    default:
-      return TRT_DT::kINT32;
-  }
-  PADDLE_THROW("unkown type");
-  return TRT_DT::kINT32;
-}
-
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
-  PADDLE_ENFORCE_GT(shape.size(), 1UL,
-                    "TensorRT' tensor input requires at least 2 dimensions");
-  PADDLE_ENFORCE_LE(shape.size(), 4UL,
-                    "TensorRT' tensor input requires at most 4 dimensions");
-  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
-  if (shape.size() == 4UL)
-    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
-  return nvinfer1::DimsCHW(shape[1], 1, 1);
-}
-
-}  // namespace // NOLINT
-
 using inference::Singleton;
 using inference::tensorrt::TensorRTEngine;
 using inference::tensorrt::TRTInt8Calibrator;
...
@@ -79,6 +50,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
   bool enable_int8_;
   std::string calibration_data_;
   std::string engine_key_;
+  std::string engine_serialized_data_;
   bool calibration_mode_;
 
  public:
...
@@ -93,6 +65,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     enable_int8_ = Attr<bool>("enable_int8");
     calibration_data_ = Attr<std::string>("calibration_data");
     engine_key_ = Attr<std::string>("engine_key");
+    engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
 
     auto params = Attr<std::vector<std::string>>("parameters");
     for (const auto& param : params) {
...
@@ -125,7 +98,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
       RunCalibration(scope, dev_place);
       return;
     }
-    RunTrt(scope, dev_place);
+    auto* trt_engine = GetEngine(scope, dev_place);
+    RunTrt(scope, dev_place, trt_engine);
   }
 
   void RunCalibration(const framework::Scope& scope,
...
@@ -136,10 +110,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
     LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_
                          << " is running calibration trt int8... ";
     int runtime_batch = 1;
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& dev_ctx = *pool.Get(dev_place);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx).stream();
     if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
       TRTCalibratorEngine* calib_res =
           Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
...
@@ -156,11 +126,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
           calib_buffers, runtime_batch, engine_key_, dev_place));
       calib_res->thr_.reset(new std::thread([&]() {
         calib_res->engine_.reset(new TensorRTEngine(
-            max_batch_size_, workspace_size_, stream,
-            boost::get<platform::CUDAPlace>(dev_place).device, enable_int8_,
-            calib_res->calib_.get()));
+            max_batch_size_, workspace_size_, enable_int8_,
+            calib_res->calib_.get(),
+            boost::get<platform::CUDAPlace>(dev_place).device));
         VLOG(3) << "start the calib trt engine thread";
-        Prepare(scope, dev_place, calib_res->engine_.get());
+        PrepareTRTEngine(scope, calib_res->engine_.get());
       }));
     }
...
@@ -180,28 +150,29 @@ class TensorRTEngineOp : public framework::OperatorBase {
     RunNativeImpl(scope, dev_place);
   }
 
   void RunTrt(const framework::Scope& scope,
-              const platform::Place& dev_place) const {
+              const platform::Place& dev_place,
+              TensorRTEngine* engine) const {
     int runtime_batch = 1;
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& dev_ctx = *pool.Get(dev_place);
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx).stream();
-    if (trt_engine_.get() == nullptr) {
-      trt_engine_.reset(
-          new TensorRTEngine(max_batch_size_, workspace_size_, stream,
-                             boost::get<platform::CUDAPlace>(dev_place).device,
-                             enable_int8_, calibrator_.get()));
-      Prepare(scope, dev_place, trt_engine_.get());
-    }
-    auto* engine = trt_engine_.get();
     PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
 
     std::vector<std::string> output_maps =
         Attr<std::vector<std::string>>("output_name_mapping");
 
+    int num_inputs = 0;
+    for (const auto& x : Inputs("Xs")) {
+      if (param_names_.count(x)) continue;
+      num_inputs += 1;
+    }
+    const int num_bindings = num_inputs + Outputs("Ys").size();
+    std::vector<void*> buffers(num_bindings);
+
-    // Convert input tensor from fluid to engine.
+    // Bind input tensor to TRT.
     for (const auto& x : Inputs("Xs")) {
       if (param_names_.count(x)) continue;
       // convert input and copy to TRT engine's buffer
...
@@ -209,28 +180,20 @@ class TensorRTEngineOp : public framework::OperatorBase {
           inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
       auto t_shape = framework::vectorize(t.dims());
       runtime_batch = t_shape[0];
-      if (platform::is_cpu_place(t.place())) {
-        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
-                                t.memory_size());
-      } else {
-        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
-                                t.memory_size());
-      }
-    }
-    cudaStreamSynchronize(stream);
-    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
-    // Execute the engine.
-    engine->Execute(runtime_batch);
+      const int bind_index = engine->engine()->getBindingIndex(x.c_str());
+      PADDLE_ENFORCE(bind_index < num_bindings,
+                     "The bind index should be less than num_bindings");
+      buffers[bind_index] = static_cast<void*>(t.data<float>());
+    }
 
-    // Convert output tensor from engine to fluid
+    // Bind output tensor to TRT.
     int output_index = 0;
     VLOG(4) << "TensorRT Engine Op Outputs:";
     for (const auto& y : Outputs("Ys")) {
-      VLOG(4) << y;
-      // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
-      auto dims = trt_t->getDimensions();
+      const int bind_index =
+          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
+      auto dims = engine->engine()->getBindingDimensions(bind_index);
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       // The ITensor doesn't contain the batch size dim.
       std::vector<int> ddim;
...
@@ -238,71 +201,55 @@ class TensorRTEngineOp : public framework::OperatorBase {
       for (int i = 0; i < dims.nbDims; i++) {
         ddim.push_back(dims.d[i]);
       }
       auto* fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
       fluid_t->Resize(framework::make_ddim(ddim));
 
-      // TODO(Superjomn) change this float to dtype size.
-      auto size =
-          inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
-      engine->GetOutputInGPU(
-          output_maps[output_index],
-          fluid_t->mutable_data<float>(platform::CUDAPlace(
-              boost::get<platform::CUDAPlace>(dev_place).device)),
-          size * sizeof(float));
+      PADDLE_ENFORCE(bind_index < num_bindings,
+                     "The bind index should be less than num_bindings");
+      buffers[bind_index] = static_cast<void*>(fluid_t->mutable_data<float>(
+          platform::CUDAPlace(
+              boost::get<platform::CUDAPlace>(dev_place).device)));
       output_index += 1;
     }
 
+    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
+    // Execute the engine.
+    engine->Execute(runtime_batch, &buffers, stream);
     cudaStreamSynchronize(stream);
   }
 
-  void Prepare(const framework::Scope& scope, const platform::Place& dev_place,
-               TensorRTEngine* engine) const {
+  TensorRTEngine* GetEngine(const framework::Scope& scope,
+                            const platform::Place& dev_place) const {
+    if (!trt_engine_) {
+      trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
+          max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
+          boost::get<platform::CUDAPlace>(dev_place).device));
+      if (!engine_serialized_data_.empty()) {
+        trt_engine_->Deserialize(engine_serialized_data_);
+      } else {
+        PrepareTRTEngine(scope, trt_engine_.get());
+      }
+    }
+    return trt_engine_.get();
+  }
+
+  void PrepareTRTEngine(const framework::Scope& scope,
+                        TensorRTEngine* engine) const {
     LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
                  "kernel etc). This process may cost a lot of time.";
-    framework::proto::BlockDesc block_desc;
-    block_desc.ParseFromString(Attr<std::string>("subgraph"));
+    framework::proto::BlockDesc block_proto;
+    block_proto.ParseFromString(Attr<std::string>("subgraph"));
+    framework::BlockDesc block_desc(nullptr, &block_proto);
 
-    std::vector<std::string> output_maps =
+    std::vector<std::string> inputs = Inputs("Xs");
+    std::vector<std::string> outputs =
         Attr<std::vector<std::string>>("output_name_mapping");
 
-    engine->InitNetwork();
-
-    framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
-    VLOG(4) << "parsed var size " << block.AllVars().size();
-    // Add inputs
-    VLOG(4) << "declare inputs";
-    for (auto& input : Inputs("Xs")) {
-      if (param_names_.count(input)) continue;
-      VLOG(4) << "declare input " << input;
-
-      auto& t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
-      auto t_shape = framework::vectorize(t.dims());
-
-      auto* var = block.FindVar(input);
-      // TensorRT engine need to create parameters. The parameter's description
-      // should be set in
-      PADDLE_ENFORCE(var, "no variable called %s", input);
-      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
-                        "TensorRT engine only takes LoDTensor as input");
-
-      engine->DeclareInput(
-          input, FluidDataType2TRT(
-                     var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(t_shape));
-    }
     inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlock(block_desc, param_names_, scope, engine);
-
-    // Add outputs
-    for (auto& output : output_maps) {
-      engine->DeclareOutput(output);
-    }
-
-    engine->FreezeNetwork();
+        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
+                                 outputs, engine);
   }
 };
...
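The net effect of this refactor is a build-or-deserialize decision made once in GetEngine: if engine_serialized_data is non-empty the ICUDAEngine is restored directly, otherwise the subgraph is converted through the (slow) PrepareTRTEngine path, and either way the result is cached for later runs. A compressed sketch of that control flow, with stand-in types instead of the real TensorRTEngine API:

#include <memory>
#include <string>

// Stand-ins for the real classes; only the decision logic is the point here.
struct Engine {
  void Deserialize(const std::string&) {}  // restore a previously serialized engine
  void BuildFromSubgraph() {}              // stand-in for PrepareTRTEngine(scope, engine)
};

class EngineOp {
 public:
  explicit EngineOp(std::string serialized) : serialized_(std::move(serialized)) {}

  Engine* GetEngine() {
    if (!engine_) {
      engine_.reset(new Engine());
      if (!serialized_.empty()) {
        engine_->Deserialize(serialized_);  // fast path: skip conversion entirely
      } else {
        engine_->BuildFromSubgraph();       // slow path, done once and cached
      }
    }
    return engine_.get();  // subsequent Run() calls reuse the cached engine
  }

 private:
  std::string serialized_;
  std::unique_ptr<Engine> engine_;
};

int main() {
  EngineOp op("");  // no serialized data: build once
  op.GetEngine();
  op.GetEngine();   // second call reuses the cached engine
  return 0;
}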
...
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
...
@@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z0"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
+  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
...
@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z3"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
+  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
 
   auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
...
paddle/fluid/platform/gpu_info.cc
...
@@ -38,6 +38,22 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "additional trunks of the same size will be requested from gpu "
               "until the gpu has no memory left for another trunk.");
 
+DEFINE_double(
+    initial_gpu_memory_in_mb, -1.0,
+    "GPU memory chunk size in MB."
+    "Allocator would allocate FLAGS_initial_gpu_memory_in_mb size "
+    "chunk first and reallocate FLAGS_reallocate_gpu_memory_in_mb size "
+    "chunk when the first chunk is not enough. This flag has higher priority "
+    "than FLAGS_fraction_of_gpu_memory_to_use. Disable when less than 0.");
+
+DEFINE_double(
+    reallocate_gpu_memory_in_mb, -1.0,
+    "GPU memory chunk size in MB."
+    "If FLAGS_initial_gpu_memory_in_mb is set and "
+    "FLAGS_reallocate_gpu_memory_in_mb "
+    "is less than 0, it would be replaced by "
+    "FLAGS_initial_gpu_memory_in_mb. Disable "
+    "when FLAGS_initial_gpu_memory_in_mb is less than 0.");
+
 DEFINE_bool(
     enable_cublas_tensor_op_math, false,
     "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
...
...
@@ -211,13 +227,54 @@ size_t GpuMaxChunkSize() {
size_t
allocating
=
static_cast
<
size_t
>
(
FLAGS_fraction_of_gpu_memory_to_use
*
size_t
allocating
=
static_cast
<
size_t
>
(
FLAGS_fraction_of_gpu_memory_to_use
*
(
total
-
reserving
));
(
total
-
reserving
));
PADDLE_ENFORCE_LE
(
allocating
,
available
,
PADDLE_ENFORCE_LE
(
allocating
,
available
,
"Insufficient GPU memory to allocation."
);
"Insufficient GPU memory to allocation."
);
return
allocating
;
return
allocating
;
}
}
size_t
GpuFirstAllocateChunkSize
()
{
if
(
FLAGS_initial_gpu_memory_in_mb
<=
0
)
{
return
GpuMaxChunkSize
();
}
size_t
total
=
0
;
size_t
available
=
0
;
GpuMemoryUsage
(
&
available
,
&
total
);
VLOG
(
10
)
<<
"GPU Usage "
<<
available
/
1024
/
1024
<<
"M/"
<<
total
/
1024
/
1024
<<
"M"
;
size_t
initial_mem
=
static_cast
<
size_t
>
(
FLAGS_initial_gpu_memory_in_mb
*
(
1
<<
20
));
PADDLE_ENFORCE_LE
(
initial_mem
,
available
,
"Insufficient GPU memory to allocation."
);
return
initial_mem
;
}
size_t
GpuReAllocateChunkSize
()
{
if
(
FLAGS_initial_gpu_memory_in_mb
<=
0
)
{
return
GpuMaxChunkSize
();
}
double
reallocate_mem
=
FLAGS_reallocate_gpu_memory_in_mb
;
if
(
reallocate_mem
<
0
)
{
PADDLE_ENFORCE
(
FLAGS_initial_gpu_memory_in_mb
>
0
,
"FLAGS_init_gpu_memory_to_use_mb must be larger than 0"
);
reallocate_mem
=
FLAGS_initial_gpu_memory_in_mb
;
}
size_t
total
=
0
;
size_t
available
=
0
;
GpuMemoryUsage
(
&
available
,
&
total
);
VLOG
(
10
)
<<
"GPU Usage "
<<
available
/
1024
/
1024
<<
"M/"
<<
total
/
1024
/
1024
<<
"M"
;
size_t
realloc_mem
=
static_cast
<
size_t
>
(
reallocate_mem
*
(
1
<<
20
));
PADDLE_ENFORCE_LE
(
realloc_mem
,
available
,
"Insufficient GPU memory to allocation."
);
return
realloc_mem
;
}
void
GpuMemcpyAsync
(
void
*
dst
,
const
void
*
src
,
size_t
count
,
void
GpuMemcpyAsync
(
void
*
dst
,
const
void
*
src
,
size_t
count
,
enum
cudaMemcpyKind
kind
,
cudaStream_t
stream
)
{
enum
cudaMemcpyKind
kind
,
cudaStream_t
stream
)
{
PADDLE_ENFORCE
(
cudaMemcpyAsync
(
dst
,
src
,
count
,
kind
,
stream
),
PADDLE_ENFORCE
(
cudaMemcpyAsync
(
dst
,
src
,
count
,
kind
,
stream
),
...
...
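The two new gflags are also exported to Python in this commit (see python/paddle/fluid/__init__.py below), so they can be set through FLAGS_ environment variables before the framework is imported. A minimal sketch of that usage follows; the 500 MB / 100 MB values are illustrative assumptions, not values taken from this change.

    import os

    # Assumed illustrative chunk sizes: a 500 MB first allocation and 100 MB
    # re-allocation chunks, instead of the fraction_of_gpu_memory_to_use policy.
    # These must be set before importing paddle.fluid, because __bootstrap__
    # reads FLAGS_* environment variables at import time.
    os.environ['FLAGS_initial_gpu_memory_in_mb'] = '500'
    os.environ['FLAGS_reallocate_gpu_memory_in_mb'] = '100'

    import paddle.fluid as fluid

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)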
paddle/fluid/platform/gpu_info.h

@@ -66,6 +66,12 @@ size_t GpuMinChunkSize();
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();

+//! Get init chunk size for GPU buddy allocator.
+size_t GpuFirstAllocateChunkSize();
+
+//! Get reallocate chunk size for GPU buddy allocator.
+size_t GpuReAllocateChunkSize();
+
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream);

...
paddle/fluid/platform/temporary_allocator.cc

@@ -77,6 +77,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
  }
  VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
           << " size: " << temp_allocation->size();
+ alloc::AllocationDeleter()(temp_allocation);
}

size_t TemporaryAllocator::TemporaryAllocationQueueSize() {

...
paddle/fluid/pybind/inference_api.cc

@@ -221,7 +221,8 @@ void BindAnalysisConfig(py::module *m) {
      .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
           py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
           py::arg("min_subgraph_size") = 3,
-          py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
+          py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
+          py::arg("use_static") = true)
      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
      .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
           py::arg("x") = true)

...
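From the Python side, the new keyword surfaces directly on AnalysisConfig. The sketch below assumes only the bindings shown in this file plus the existing enable_use_gpu/create_paddle_predictor bindings; the model directory is a placeholder.

    from paddle.fluid.core import AnalysisConfig, create_paddle_predictor

    # Placeholder model directory; any saved inference model would do.
    config = AnalysisConfig('./mobilenet_model')
    config.enable_use_gpu(100, 0)   # 100 MB initial GPU memory pool on device 0
    # use_static=True is the argument added here: the built TensorRT engine may
    # be serialized and reused instead of being rebuilt on every load.
    config.enable_tensorrt_engine(workspace_size=1 << 20,
                                  max_batch_size=1,
                                  min_subgraph_size=3,
                                  use_static=True)
    predictor = create_paddle_predictor(config)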
python/paddle/fluid/__init__.py

@@ -159,6 +159,7 @@ def __bootstrap__():
     if core.is_compiled_with_cuda():
         read_env_flags += [
+            'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb',
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
             'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',

...
python/paddle/fluid/imperative/layer_object_helper.py (new file, 0 → 100644)

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import copy
import six

from ..framework import Parameter, _in_imperative_mode
from ..param_attr import ParamAttr
from .. import core
from six.moves import zip
from ..layer_helper_base import LayerHelperBase


class LayerObjectHelper(LayerHelperBase):
    def __init__(self, name):
        super(LayerObjectHelper, self).__init__(name, layer_type=name)

    def append_op(self, type=None, inputs=None, outputs=None, attrs=None,
                  stop_gradient=None):
        """append an operator for this layer object.

           Args:
               type: operator type
               inputs: input variable of the operator
               dtype: data type of this parameter
               is_bias: if this is a bias parameter
               default_initializer: set the default initializer for this parameter

        Returns created parameter Variable.
        """
        return self.main_program.current_block().append_op(
            type=type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=stop_gradient)

    def _multiple_input(self, inputs_in):
        inputs = inputs_in
        ret = []
        if isinstance(inputs, (list, tuple)):
            for inp in inputs:
                ret.append(self.to_variable(inp))
        else:
            ret.append(self.to_variable(inputs))
        return ret

    # TODO: make it public when we need it
    def _input(self, inputs_in):
        inputs = self._multiple_input(inputs_in)
        if len(inputs) != 1:
            raise "{0} layer only takes one input".format(self.layer_type)
        return inputs[0]

    def _multiple_param_attr(self, length, param_attr_in=None):
        param_attr = param_attr_in
        if isinstance(param_attr, ParamAttr):
            param_attr = [param_attr]

        if len(param_attr) != 1 and len(param_attr) != length:
            raise ValueError("parameter number mismatch")
        elif len(param_attr) == 1 and length != 1:
            tmp = [None] * length
            for i in six.moves.range(length):
                tmp[i] = copy.deepcopy(param_attr[0])
            param_attr = tmp
        return param_attr

    def iter_inputs_and_params(self, inputs_in, param_attr_in=None):
        """Access all inputs and params one by one

           Args:
               inputs_in: inputs to be iter
               param_attr_in: param_attr to be iter

        Returns input, param_attr
        """
        inputs = inputs_in if (inputs_in is not None) else []
        inputs = self._multiple_input(inputs)
        param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
        for ipt, param_attr in zip(inputs, param_attrs):
            yield ipt, param_attr

    def input_dtype(self, inputs_in):
        """Get input data type

           Args:
               inputs_in: inputs wanted know the data type

        Returns dtype of the input
        """
        inputs = self._multiple_input(inputs_in)
        dtype = None
        for each in inputs:
            if dtype is None:
                dtype = each.dtype
            elif dtype != each.dtype:
                raise ValueError("Data Type mismatch: %d to %d" %
                                 (dtype, each.dtype))
        return dtype

    def get_parameter(self, name):
        """Get parameter specifically

           Args:
               name: parameter's name

        Returns target parameter
        """
        param = self.main_program.global_block().var(name)
        if not isinstance(param, Parameter):
            raise ValueError("no Parameter name %s found" % name)
        return param

    def append_bias_op(self, input_var, dim_start=1, dim_end=None,
                       bias_attr=None):
        """Append bias operator and return its output. If the user does not set bias_attr, append_bias_op will return input_var

           Args:
               input_var: the input variable. The len(input_var.shape) is
               larger or equal than 2.
               dim_start:
               dim_end: the shape of the bias will be
               bias_attr: the bias_attr of it

        Return the Variable of after append bias op
        """
        size = list(input_var.shape[dim_start:dim_end])
        bias_attr = bias_attr
        if not bias_attr:
            return input_var

        b = self.create_parameter(
            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
        self.append_op(
            type='elementwise_add',
            inputs={'X': [input_var],
                    'Y': [b]},
            outputs={'Out': [tmp]},
            attrs={'axis': dim_start})
        return tmp

    # TODO: this should not be called anymore after all activation func move to Layers
    def append_activation(self, input_var, act=None, use_cudnn=None,
                          use_mkl_dnn=None):
        """Append activation

           Args:
               input_var: the input variable. The len(input_var.shape) is
               larger or equal than 2.
               act: activation type
               use_mkl_dnn: if use mkldnn
               use_cudnn: if use cudnn

        Return the Variable of after append activation
        """
        act = act
        if act is None:
            return input_var
        if isinstance(act, six.string_types):
            act = {'type': act}
        else:
            raise TypeError(str(act) + " should be unicode or str")

        if (use_cudnn is not None) and use_cudnn:
            act['use_cudnn'] = use_cudnn
        if (use_mkl_dnn is not None) and use_mkl_dnn:
            act['use_mkldnn'] = use_mkl_dnn
        act_type = act.pop('type')

        tmp = input_var
        # NOTE(dzhwinter): some activation support inplace compution.
        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
        if not _in_imperative_mode() and core.IsInplace(act_type):
            tmp = input_var
        else:
            tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
        self.append_op(
            type=act_type,
            inputs={"X": [input_var]},
            outputs={"Out": [tmp]},
            attrs=act)
        return tmp

    def is_instance(self, param, cls):
        """Check if the input parameter is instance of input class

           Args:
               param: parameter to be check
               cls: class of the parameter

        Return result of the check (True or False)
        """
        param = param
        if not isinstance(param, cls):
            raise TypeError("The input {0} parameter of method {1} must be {2}",
                            param, self.layer_type, cls.__name__)
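A short hedged sketch of how this helper is meant to be driven (not code from this commit): the helper appends ops into the current block and can apply an activation on top of the result. The layer name, op choice and shapes are illustrative assumptions.

    import paddle.fluid as fluid
    from paddle.fluid.imperative.layer_object_helper import LayerObjectHelper

    helper = LayerObjectHelper('my_scale')   # hypothetical layer name
    x = fluid.layers.data(name='x', shape=[3], dtype='float32')
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
    # Append a scale op through the helper, then a ReLU via append_activation.
    helper.append_op(type='scale', inputs={'X': [x]}, outputs={'Out': [out]},
                     attrs={'scale': 2.0})
    out = helper.append_activation(out, act='relu')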
python/paddle/fluid/imperative/layers.py

@@ -19,8 +19,8 @@ import numpy as np
 import collections
 from .. import unique_name
 from paddle.fluid import core
+from .layer_object_helper import LayerObjectHelper
 from paddle.fluid import framework
-from paddle.fluid.imperative import base

 __all__ = ['Layer', 'PyLayer']

@@ -44,6 +44,8 @@ class Layer(core.Layer):
         self._parameters = collections.OrderedDict()
         self._sub_layers = collections.OrderedDict()
+        self._helper = LayerObjectHelper(self._full_name)
+
     def full_name(self):
         """Full name for this layers.

@@ -53,6 +55,51 @@ class Layer(core.Layer):
         """
         return self._full_name

+    def create_parameter(self,
+                         attr,
+                         shape,
+                         dtype,
+                         is_bias=False,
+                         default_initializer=None):
+        """Create parameters for this layers.
+
+           Args:
+               attr: [ParamAttr] should be the parameter attribute for this parameter
+               shape: shape of the paramter
+               dtype: data type of this parameter
+               is_bias: if this is a bias parameter
+               default_initializer: set the default initializer for this parameter
+
+        Returns created parameter Variable.
+        """
+        return self._helper.create_parameter(attr, shape, dtype, is_bias,
+                                             default_initializer)
+
+    # TODO: Add more parameter list when we need them
+    def create_variable(self,
+                        name=None,
+                        persistable=None,
+                        dtype=None,
+                        type=core.VarDesc.VarType.LOD_TENSOR):
+        """Create Variable for this layers.
+
+           Args:
+               name: name of the variable
+               persistable: if set this variable persistable
+               dtype: data type of data in the variable
+               type: type of the variable
+
+        Returns created Variable.
+        """
+        if name is not None:
+            var_name = ".".join([self._full_name, name])
+        else:
+            var_name = unique_name.generate(".".join(
+                [self._full_name, "_generated_var"]))
+
+        return self._helper.main_program.current_block().create_var(
+            name=var_name, persistable=persistable, dtype=dtype, type=type)
+
     def parameters(self, include_sublayers=True):
         """Returns a list of Parameters from current and sub-layers.

...
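The new Layer.create_parameter lets a user-defined imperative layer own its weights without touching LayerHelper directly. A minimal hedged sketch, with an illustrative layer name and shapes that are not part of this commit:

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.imperative import Layer

    class MyScale(Layer):
        def __init__(self, name_scope, size):
            super(MyScale, self).__init__(name_scope)
            # Per-channel learnable scale created through the new API.
            self._w = self.create_parameter(
                attr=None, shape=[size], dtype='float32', is_bias=False)

        def forward(self, x):
            return fluid.layers.elementwise_mul(x, self._w, axis=-1)

    with fluid.imperative.guard():
        layer = MyScale("my_scale", size=3)
        x = fluid.imperative.base.to_variable(np.ones([2, 3], dtype='float32'))
        y = layer(x)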
python/paddle/fluid/imperative/nn.py

@@ -41,21 +41,12 @@ class Conv2D(layers.Layer):
                  bias_attr=None,
                  dtype=core.VarDesc.VarType.FP32):
         assert param_attr is not False, "param_attr should not be False here."
-        super(Conv2D, self).__init__(name_scope, dtype=dtype)
-
-        # TODO(minqiyang): Move this to the top.
-        from ..layer_helper import LayerHelper
-        self._helper = LayerHelper(
-            self.full_name(),
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            dtype=dtype,
-            act=act)
+        super(Conv2D, self).__init__(name_scope)
         self._groups = groups
         self._stride = utils.convert_to_list(stride, 2, 'stride')
         self._padding = utils.convert_to_list(padding, 2, 'padding')
         self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
+        self._act = act
         if not isinstance(use_cudnn, bool):
             raise ValueError("use_cudnn should be True or False")
         self._use_cudnn = use_cudnn

@@ -80,28 +71,28 @@ class Conv2D(layers.Layer):
             std = (2.0 / filter_elem_num)**0.5
             return Normal(0.0, std, 0)

-        self._filter_param = self._helper.create_parameter(
-            attr=self._helper.param_attr,
+        self._filter_param = self.create_parameter(
+            attr=param_attr,
             shape=filter_shape,
             dtype=self._dtype,
             default_initializer=_get_default_param_initializer())

         if self._use_cudnn:
-            self._helper.create_variable(
+            self.create_variable(
                 name="kCUDNNFwdAlgoCache",
                 persistable=True,
                 type=core.VarDesc.VarType.RAW)
-            self._helper.create_variable(
+            self.create_variable(
                 name="kCUDNNBwdDataAlgoCache",
                 persistable=True,
                 type=core.VarDesc.VarType.RAW)
-            self._helper.create_variable(
+            self.create_variable(
                 name="kCUDNNBwdFilterAlgoCache",
                 persistable=True,
                 type=core.VarDesc.VarType.RAW)

-        self._bias_param = self._helper.create_parameter(
-            attr=self._helper.bias_attr,
+        self._bias_param = self.create_parameter(
+            attr=bias_attr,
             shape=[num_filters],
             dtype=self._dtype,
             is_bias=True)

@@ -137,7 +128,7 @@ class Conv2D(layers.Layer):
             attrs={'axis': 1})

         # Currently, we don't support inplace in imperative mode
-        return self._helper.append_activation(pre_act)
+        return self._helper.append_activation(pre_act, act=self._act)


 class Pool2D(layers.Layer):

@@ -167,9 +158,6 @@ class Pool2D(layers.Layer):
         super(Pool2D, self).__init__(name_scope, dtype=dtype)

-        from ..layer_helper import LayerHelper
-        self._helper = LayerHelper(self.full_name(), dtype=dtype)
-
         self._pool_type = pool_type
         self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
         self._pool_padding = utils.convert_to_list(pool_padding, 2,

@@ -216,28 +204,25 @@ class FC(layers.Layer):
         self._size = size
         self._num_flatten_dims = num_flatten_dims
         self._dtype = dtype
-        from ..layer_helper import LayerHelper
-        self._helper = LayerHelper(
-            self.full_name(),
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            act=act)
+        self._param_attr = param_attr
+        self._bias_attr = param_attr
+        self._act = act

     def _build_once(self, input):
         input_shape = input.shape
         param_shape = [
             reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
         ] + [self._size]
-        self._w = self._helper.create_parameter(
-            attr=self._helper.param_attr,
+        self._w = self.create_parameter(
+            attr=self._param_attr,
             shape=param_shape,
             dtype=self._dtype,
             is_bias=False)

-        if self._helper.bias_attr:
+        if self._param_attr:
             size = list([self._size])
-            self._b = self._helper.create_parameter(
-                attr=self._helper.bias_attr,
+            self._b = self.create_parameter(
+                attr=self._param_attr,
                 shape=size,
                 dtype=self._dtype,
                 is_bias=True)

@@ -275,7 +260,7 @@ class FC(layers.Layer):
         else:
             pre_activation = pre_bias
         # Currently, we don't support inplace in imperative mode
-        return self._helper.append_activation(pre_activation)
+        return self._helper.append_activation(pre_activation, act=self._act)


 class BatchNorm(layers.Layer):

@@ -297,16 +282,12 @@ class BatchNorm(layers.Layer):
                  fuse_with_relu=False,
                  use_global_stats=False):
         super(BatchNorm, self).__init__(name_scope)
+        self._param_attr = param_attr
+        self._param_attr = bias_attr
+        self._act = act

         assert bias_attr is not False, "bias_attr should not be False in batch_norm."

-        from ..layer_helper import LayerHelper
-        self._helper = LayerHelper(
-            self.full_name(), param_attr=param_attr, bias_attr=bias_attr, act=act)
-
         if dtype == core.VarDesc.VarType.FP16:
             self._dtype = core.VarDesc.VarType.FP32
         else:

@@ -315,23 +296,23 @@ class BatchNorm(layers.Layer):
         param_shape = [num_channels]

         # create parameter
-        self._scale = self._helper.create_parameter(
-            attr=self._helper.param_attr,
+        self._scale = self.create_parameter(
+            attr=self._param_attr,
             shape=param_shape,
             dtype=self._dtype,
             default_initializer=Constant(1.0))
-        if use_global_stats and self._helper.param_attr.learning_rate == 0.:
+        if use_global_stats and self._param_attr.learning_rate == 0.:
             self._scale._stop_gradient = True

-        self._bias = self._helper.create_parameter(
-            attr=self._helper.bias_attr,
+        self._bias = self.create_parameter(
+            attr=self._param_attr,
             shape=param_shape,
             dtype=self._dtype,
             is_bias=True)
-        if use_global_stats and self._helper.bias_attr.learning_rate == 0.:
+        if use_global_stats and self._param_attr.learning_rate == 0.:
             self._bias._stop_gradient = True

-        self._mean = self._helper.create_parameter(
+        self._mean = self.create_parameter(
             attr=ParamAttr(
                 name=moving_mean_name,
                 initializer=Constant(0.0),

@@ -341,7 +322,7 @@ class BatchNorm(layers.Layer):
             dtype=self._dtype)
         self._mean._stop_gradient = True

-        self._variance = self._helper.create_parameter(
+        self._variance = self.create_parameter(
             attr=ParamAttr(
                 name=moving_variance_name,
                 initializer=Constant(1.0),

@@ -401,7 +382,7 @@ class BatchNorm(layers.Layer):
         })

         # Currently, we don't support inplace in imperative mode
-        return self._helper.append_activation(batch_norm_out)
+        return self._helper.append_activation(batch_norm_out, self._act)


 class Embedding(layers.Layer):

@@ -466,9 +447,7 @@ class Embedding(layers.Layer):
         if self._remote_prefetch:
             assert self._is_sparse is True and self._is_distributed is False

-        from ..layer_helper import LayerHelper
-        self._helper = LayerHelper(self.full_name(), param_attr=param_attr)
-        self._w = self._helper.create_parameter(
+        self._w = self.create_parameter(
             attr=self._param_attr,
             shape=self._size,
             dtype=self._dtype,

...
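After this refactor, the imperative layers keep `act` on the instance and hand it to append_activation at forward time instead of baking it into an internal LayerHelper. A hedged usage sketch with made-up shapes (the constructor keyword names are assumed from this era of the imperative API):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.imperative.nn import Conv2D, FC

    with fluid.imperative.guard():
        img = fluid.imperative.base.to_variable(
            np.random.random([1, 1, 28, 28]).astype('float32'))
        # act is stored as self._act and applied in forward().
        conv = Conv2D("conv", num_channels=1, num_filters=8, filter_size=3,
                      act='relu')
        fc = FC("fc", size=10, act='softmax')
        out = fc(conv(img))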
python/paddle/fluid/initializer.py

@@ -19,7 +19,6 @@ import numpy as np
 from .wrapped_decorator import signature_safe_contextmanager
 from .core import VarDesc
 from . import unique_name
-from .imperative import base as imperative_base

 __all__ = [
     'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',

@@ -166,7 +165,7 @@ class ConstantInitializer(Initializer):
                 'force_cpu': self._force_cpu or force_init_on_cpu()
             },
             stop_gradient=True)
-        if not imperative_base.enabled():
+        if not framework._in_imperative_mode():
             var.op = op
         return op

@@ -246,7 +245,7 @@ class UniformInitializer(Initializer):
                 attrs={"in_dtype": out_var.dtype,
                        "out_dtype": var.dtype})
-        if not imperative_base.enabled():
+        if not framework._in_imperative_mode():
             var.op = op
         return op

@@ -325,7 +324,7 @@ class NormalInitializer(Initializer):
                 outputs={"Out": var},
                 attrs={"in_dtype": out_var.dtype,
                        "out_dtype": var.dtype})
-        if not imperative_base.enabled():
+        if not framework._in_imperative_mode():
             var.op = op
         return op

@@ -404,7 +403,7 @@ class TruncatedNormalInitializer(Initializer):
                 outputs={"Out": var},
                 attrs={"in_dtype": out_var.dtype,
                        "out_dtype": var.dtype})
-        if not imperative_base.enabled():
+        if not framework._in_imperative_mode():
             var.op = op
         return op

@@ -510,7 +509,7 @@ class XavierInitializer(Initializer):
                 "seed": self._seed
             },
             stop_gradient=True)
-        if not imperative_base.enabled():
+        if not framework._in_imperative_mode():
             var.op = op
         return op

@@ -611,7 +610,7 @@ class MSRAInitializer(Initializer):
                 "seed": self._seed
             },
             stop_gradient=True)
-        if not imperative_base.enabled():
+        if not framework._in_imperative_mode():
             var.op = op
         return op

@@ -710,7 +709,7 @@ class BilinearInitializer(Initializer):
                 'shape': list(shape),
                 value_name: values
             })
-        if not imperative_base.enabled():
+        if not framework._in_imperative_mode():
             var.op = op
         return op

@@ -769,7 +768,7 @@ class NumpyArrayInitializer(Initializer):
                 value_name: values
             },
             stop_gradient=True)
-        if not imperative_base.enabled():
+        if not framework._in_imperative_mode():
             var.op = op
         return op

...
python/paddle/fluid/layer_helper.py

@@ -15,45 +15,29 @@
 from __future__ import print_function

 import copy
-import itertools
 import six
-import sys
-import numpy as np

-from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode
+from .framework import Parameter, dtype_is_floating, _in_imperative_mode
 from . import unique_name
-from paddle.fluid.imperative import base as imperative_base
 from paddle.fluid.initializer import Constant, Xavier
-from .param_attr import ParamAttr, WeightNormParamAttr
+from .param_attr import ParamAttr
 from . import core
 from six.moves import zip
+from .layer_helper_base import LayerHelperBase


-class LayerHelper(object):
+class LayerHelper(LayerHelperBase):
     def __init__(self, layer_type, **kwargs):
         self.kwargs = kwargs
-        self.layer_type = layer_type
         name = self.kwargs.get('name', None)
         # TODO(panyx0718, minqiyang): imperative mode
         # can not use both `layer_type` and `name`. Deprecate LayerHelper
         # and write a Helper for imperative mode.
         if name is None:
-            self.kwargs['name'] = unique_name.generate(self.layer_type)
+            self.kwargs['name'] = unique_name.generate(layer_type)

-    @property
-    def name(self):
-        return self.kwargs['name']
-
-    @property
-    def main_program(self):
-        return default_main_program()
-
-    @property
-    def startup_program(self):
-        return default_startup_program()
-
-    def to_variable(self, x):
-        return imperative_base.to_variable(x, self.main_program.current_block())
+        super(LayerHelper, self).__init__(
+            self.kwargs['name'], layer_type=layer_type)

     def append_op(self, *args, **kwargs):
         return self.main_program.current_block().append_op(*args, **kwargs)

@@ -82,6 +66,7 @@ class LayerHelper(object):
     def bias_attr(self):
         return ParamAttr._to_attr(self.kwargs.get('bias_attr', None))

+    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of param_attr
     def multiple_param_attr(self, length):
         param_attr = self.param_attr
         if isinstance(param_attr, ParamAttr):

@@ -113,297 +98,13 @@ class LayerHelper(object):
                     (dtype, each.dtype))
         return dtype

-    # (removed here: _create_weight_normalize and create_parameter; the same
-    #  code is added, essentially verbatim, to LayerHelperBase in
-    #  python/paddle/fluid/layer_helper_base.py below)

     def get_parameter(self, name):
         param = self.main_program.global_block().var(name)
         if not isinstance(param, Parameter):
             raise ValueError("no Parameter name %s found" % name)
         return param

-    # (removed here: create_variable_for_type_inference, create_variable,
-    #  create_global_variable, create_or_get_global_variable and
-    #  set_variable_initializer; these also move into LayerHelperBase)

+    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of bias_attr
     def append_bias_op(self, input_var, dim_start=1, dim_end=None):
         """
         Append bias operator and return its output. If the user does not set

@@ -434,6 +135,7 @@ class LayerHelper(object):
             attrs={'axis': dim_start})
         return tmp

+    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of act
     def append_activation(self, input_var):
         act = self.kwargs.get('act', None)
         if act is None:

@@ -448,10 +150,11 @@ class LayerHelper(object):
         if 'use_mkldnn' in self.kwargs:
             act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
         act_type = act.pop('type')
         tmp = input_var
         # NOTE(dzhwinter): some activation support inplace compution.
         # NOTE(minqiyang): currently, we don't support inplace in imperative mode
-        if not imperative_base.enabled() and core.IsInplace(act_type):
+        if not _in_imperative_mode() and core.IsInplace(act_type):
             tmp = input_var
         else:
             tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)

@@ -462,6 +165,7 @@ class LayerHelper(object):
             attrs=act)
         return tmp

+    #TODO (jiabin): should we remove this since it has never be used
     def _get_default_initializer(self, dtype):
         if dtype is None or dtype_is_floating(dtype) is True:
             return Xavier()

@@ -469,6 +173,7 @@ class LayerHelper(object):
             # For integer and boolean types, initialize with all zeros
             return Constant()

+    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of kwargs
     def is_instance(self, param_name, cls):
         param = self.kwargs.get(param_name, None)
         if not isinstance(param, cls):

...
python/paddle/fluid/layer_helper_base.py
0 → 100644
浏览文件 @
2c4fcaa6
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
copy
import
numpy
as
np
from
.framework
import
Variable
,
default_main_program
,
default_startup_program
,
_in_imperative_mode
,
_current_expected_place
from
.
import
unique_name
from
.param_attr
import
ParamAttr
,
WeightNormParamAttr
from
.
import
core
class
LayerHelperBase
(
object
):
def
__init__
(
self
,
name
,
layer_type
):
self
.
_layer_type
=
layer_type
self
.
_name
=
name
@
property
def
name
(
self
):
return
self
.
_name
@
property
def
layer_type
(
self
):
return
self
.
_layer_type
@
property
def
main_program
(
self
):
return
default_main_program
()
@
property
def
startup_program
(
self
):
return
default_startup_program
()
def
to_variable
(
self
,
value
,
block
=
None
):
"""convert value to variable
Args:
value: value to be convert
block: the block of the variable
Return Variable construct from value
"""
if
isinstance
(
value
,
np
.
ndarray
):
assert
_in_imperative_mode
(
),
"to_variable could only be called in imperative mode"
if
not
block
:
block
=
default_main_program
().
current_block
()
py_var
=
Variable
(
block
,
type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
,
name
=
None
,
shape
=
value
.
shape
,
dtype
=
value
.
dtype
)
var
=
py_var
.
_ivar
.
value
()
tensor
=
var
.
get_tensor
()
tensor
.
set
(
value
,
_current_expected_place
())
return
py_var
elif
isinstance
(
value
,
Variable
):
return
value
def
_create_weight_normalize
(
self
,
attr
,
shape
,
dtype
):
from
.layers
import
elementwise_mul
,
elementwise_div
,
reshape
# Remove these ops when LayerHelper and layers support indicating
# program and block.
def
__norm_op
(
x
,
out
=
None
,
p
=
2
,
dim
=
None
,
keep_dim
=
False
,
block
=
self
.
startup_program
.
global_block
()):
if
out
is
None
:
out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_norm'
])),
dtype
=
dtype
,
persistable
=
False
)
abs_out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_abs'
])),
dtype
=
dtype
,
persistable
=
False
)
block
.
append_op
(
type
=
'abs'
,
inputs
=
{
'X'
:
x
},
outputs
=
{
'Out'
:
abs_out
})
pow_out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_pow'
])),
dtype
=
dtype
,
persistable
=
False
)
block
.
append_op
(
type
=
'pow'
,
inputs
=
{
'X'
:
abs_out
},
outputs
=
{
'Out'
:
pow_out
},
attrs
=
{
'factor'
:
float
(
p
)})
sum_out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_sum'
])),
dtype
=
dtype
,
persistable
=
False
)
block
.
append_op
(
type
=
'reduce_sum'
,
inputs
=
{
'X'
:
pow_out
},
outputs
=
{
'Out'
:
sum_out
},
attrs
=
{
'dim'
:
dim
,
'keep_dim'
:
keep_dim
,
'reduce_all'
:
True
if
dim
is
None
else
False
})
block
.
append_op
(
type
=
'pow'
,
inputs
=
{
'X'
:
sum_out
},
outputs
=
{
'Out'
:
out
},
attrs
=
{
'factor'
:
1.
/
p
})
return
out
def
__reshape_op
(
x
,
shape
,
out
=
None
,
block
=
self
.
startup_program
.
global_block
()):
if
out
is
None
:
out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_reshape'
])),
dtype
=
dtype
,
persistable
=
False
)
block
.
append_op
(
type
=
'reshape'
,
inputs
=
{
'X'
:
x
},
outputs
=
{
'Out'
:
out
},
attrs
=
{
'shape'
:
shape
})
return
out
def
__transpose_op
(
x
,
axis
,
out
=
None
,
block
=
self
.
startup_program
.
global_block
()):
if
out
is
None
:
out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_transpose'
])),
dtype
=
dtype
,
persistable
=
False
)
block
.
append_op
(
type
=
'transpose'
,
inputs
=
{
'X'
:
x
},
outputs
=
{
'Out'
:
out
},
attrs
=
{
'axis'
:
axis
})
return
out
def
__norm_except_dim
(
x
,
out
=
None
,
dim
=
None
,
block
=
self
.
startup_program
.
global_block
()):
"""Computes the norm over all dimensions except dim"""
if
out
is
None
:
out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_norm'
])),
dtype
=
dtype
,
persistable
=
False
)
if
dim
is
None
:
__norm_op
(
x
,
out
,
dim
=
dim
,
block
=
block
)
elif
dim
==
0
:
out_shape
=
[
x
.
shape
[
0
]]
+
[
1
]
*
(
len
(
x
.
shape
)
-
1
)
reshape
=
__reshape_op
(
x
,
shape
=
[
x
.
shape
[
0
],
-
1
],
block
=
block
)
norm
=
__norm_op
(
reshape
,
dim
=
1
,
block
=
block
)
__reshape_op
(
norm
,
out
=
out
,
shape
=
out_shape
,
block
=
block
)
elif
dim
==
len
(
x
.
shape
)
-
1
:
out_shape
=
[
1
]
*
(
len
(
x
.
shape
)
-
1
)
+
[
x
.
shape
[
-
1
]]
reshape
=
__reshape_op
(
x
,
shape
=
[
-
1
,
x
.
shape
[
-
1
]],
block
=
block
)
norm
=
__norm_op
(
reshape
,
dim
=
0
,
block
=
block
)
__reshape_op
(
norm
,
out
=
out
,
shape
=
out_shape
,
block
=
block
)
else
:
perm
=
list
(
range
(
len
(
x
.
shape
)))
perm
[
0
],
perm
[
dim
]
=
dim
,
0
transpose
=
__transpose_op
(
x
,
perm
,
block
=
block
)
norm
=
__norm_op
(
transpose
,
dim
=
0
,
block
=
block
)
__transpose_op
(
norm
,
perm
,
out
=
out
,
block
=
block
)
return
out
def
__weight_normalize
(
g
,
v
,
dim
):
"""Calculations for weight normalization"""
norm
=
__norm_except_dim
(
v
,
dim
=
dim
,
block
=
self
.
main_program
.
current_block
())
scale
=
elementwise_div
(
x
=
g
,
y
=
norm
)
# The shapes of g and norm are the same.
# Currently, elementwise_mul only support broadcast when the shape
# of y is a subset of the shape of x. Thus, we reshape y to squeeze
# to achive the subset.
w
=
elementwise_mul
(
x
=
v
,
y
=
scale
if
dim
is
None
else
reshape
(
x
=
scale
,
shape
=
[
v
.
shape
[
dim
]]),
axis
=-
1
if
dim
is
None
else
dim
)
# To serialize the original parameter for inference, maybe a
# parameter rather than a variable should be returned.
return
w
g_param_attr
=
copy
.
deepcopy
(
attr
)
g_param_attr
.
name
=
attr
.
name
+
'_g'
g_param_shape
=
[
1
]
*
len
(
shape
)
if
attr
.
dim
is
not
None
:
g_param_shape
[
attr
.
dim
]
=
shape
[
attr
.
dim
]
v_param_attr
=
copy
.
deepcopy
(
attr
)
v_param_attr
.
name
=
attr
.
name
+
'_v'
v_param_shape
=
shape
# Add to startup_program to initialize g and v.
# Try to reconstruct the initializer of w by initializing g and v.
# Set the initializers of g and v as below, then the distribution
# of w is the same as initializing w with the given initializer.
# For Data-Dependent Initialization, please compute the init-values
# of g and v in external and then feed the values to g and v by
# executing an extra program.
g_param
=
self
.
startup_program
.
global_block
().
create_parameter
(
dtype
=
dtype
,
shape
=
g_param_shape
,
**
g_param_attr
.
_to_kwargs
(
with_initializer
=
False
))
v_param
=
self
.
startup_program
.
global_block
().
create_parameter
(
dtype
=
dtype
,
shape
=
v_param_shape
,
**
v_param_attr
.
_to_kwargs
(
with_initializer
=
True
))
__norm_except_dim
(
x
=
v_param
,
out
=
g_param
,
dim
=
attr
.
dim
,
block
=
self
.
startup_program
.
global_block
())
# Add weight normalization to main_program
g_param
=
self
.
main_program
.
global_block
().
create_parameter
(
dtype
=
dtype
,
shape
=
g_param_shape
,
**
g_param_attr
.
_to_kwargs
())
v_param
=
self
.
main_program
.
global_block
().
create_parameter
(
dtype
=
dtype
,
shape
=
v_param_shape
,
**
v_param_attr
.
_to_kwargs
())
w_param
=
__weight_normalize
(
g_param
,
v_param
,
dim
=
attr
.
dim
)
return
w_param
# TODO: hide the func after we move the layers to Layers
def
create_parameter
(
self
,
attr
,
shape
,
dtype
,
is_bias
=
False
,
default_initializer
=
None
):
"""Create parameters for this layers.
Args:
attr: [ParamAttr] should be the parameter attribute for this parameter
shape: shape of the paramter
dtype: data type of this parameter
is_bias: if this is a bias parameter
default_initializer: set the default initializer for this parameter
Returns created parameter Variable.
"""
# Deepcopy the attr so that parameters can be shared in program
attr
=
copy
.
deepcopy
(
attr
)
if
attr
is
None
:
attr
=
ParamAttr
.
_to_attr
(
attr
)
assert
isinstance
(
attr
,
ParamAttr
)
suffix
=
'b'
if
is_bias
else
'w'
if
attr
.
name
is
None
:
attr
.
name
=
unique_name
.
generate
(
"."
.
join
([
self
.
name
,
suffix
]))
if
default_initializer
is
None
and
attr
.
initializer
is
None
:
if
isinstance
(
dtype
,
core
.
VarDesc
.
VarType
):
if
dtype
!=
core
.
VarDesc
.
VarType
.
FP32
and
\
dtype
!=
core
.
VarDesc
.
VarType
.
FP64
and
\
dtype
!=
core
.
VarDesc
.
VarType
.
FP16
:
raise
TypeError
(
"Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
)
else
:
if
not
(
dtype
.
startswith
(
"float"
)
or
dtype
==
"double"
):
raise
TypeError
(
"Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
)
if
is_bias
:
attr
.
_set_default_bias_initializer
()
else
:
attr
.
_set_default_param_initializer
()
else
:
attr
.
_set_default_initializer
(
default_initializer
)
# If weight normalization is set, insert extra parameters and ops.
# Refer to https://arxiv.org/pdf/1602.07868.pdf
if
isinstance
(
attr
,
WeightNormParamAttr
):
param
=
self
.
_create_weight_normalize
(
attr
,
shape
,
dtype
)
WeightNormParamAttr
.
params_with_weight_norm
.
append
(
param
)
return
param
if
_in_imperative_mode
():
# In imperative mode, we want the returned parameter to be
# initialized so that it can be used imperatively.
return
self
.
main_program
.
global_block
().
create_parameter
(
dtype
=
dtype
,
shape
=
shape
,
**
attr
.
_to_kwargs
(
with_initializer
=
True
))
else
:
self
.
startup_program
.
global_block
().
create_parameter
(
dtype
=
dtype
,
shape
=
shape
,
**
attr
.
_to_kwargs
(
with_initializer
=
True
))
return
self
.
main_program
.
global_block
().
create_parameter
(
dtype
=
dtype
,
shape
=
shape
,
**
attr
.
_to_kwargs
())
def
create_variable_for_type_inference
(
self
,
dtype
,
stop_gradient
=
False
):
"""Create a temporary variable that should be type inferred layer.
Note:
The default type will be set to LOD_TENSOR. However, when
the var is used as operator output, its type will be updated
based on operator's `VarTypeInference` implementation in
infer_var_type.
"""
return
self
.
main_program
.
current_block
().
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
([
self
.
name
,
'tmp'
])),
dtype
=
dtype
,
type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
,
persistable
=
False
,
stop_gradient
=
stop_gradient
)
def
create_variable
(
self
,
*
args
,
**
kwargs
):
"""Create Variable for this layers.
Returns created Variable.
"""
return
self
.
main_program
.
current_block
().
create_var
(
*
args
,
**
kwargs
)
def
create_global_variable
(
self
,
persistable
=
False
,
*
args
,
**
kwargs
):
"""
create global variable, note that there is no initializer for this global variable.
Args:
persistable(bool): True if it is a checkpoint value.
*args: See create_var's documentation
**kwargs: See create_var's documentation
Returns(Variable): the created variable.
"""
return
self
.
main_program
.
global_block
().
create_var
(
*
args
,
persistable
=
persistable
,
**
kwargs
)
def
create_or_get_global_variable
(
self
,
name
,
*
args
,
**
kwargs
):
"""
Creates a global variable if not exists and returns the variable and
a boolean flag which is true when it is a new variable.
"""
if
self
.
main_program
.
global_block
().
has_var
(
name
):
return
self
.
main_program
.
global_block
().
var
(
name
),
False
else
:
return
self
.
create_global_variable
(
name
=
name
,
*
args
,
**
kwargs
),
True
def
set_variable_initializer
(
self
,
var
,
initializer
):
"""Set target Variable's initializer
Args:
var: target Variable
initializer: initializer to use
"""
assert
isinstance
(
var
,
Variable
)
if
_in_imperative_mode
():
initializer
(
var
,
var
.
block
)
else
:
self
.
startup_program
.
global_block
().
create_var
(
name
=
var
.
name
,
type
=
var
.
type
,
dtype
=
var
.
dtype
,
shape
=
var
.
shape
,
persistable
=
True
,
initializer
=
initializer
)
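The weight-normalization branch above only wires extra parameters and ops into the graph; the reparameterization from the referenced paper is w = g * v / ||v||. A minimal NumPy sketch of that decomposition, for orientation only (the names v and g are illustrative assumptions, not the helper's actual _create_weight_normalize internals):

import numpy as np

# Weight normalization (Salimans & Kingma, 2016): w = g * v / ||v||.
v = np.random.randn(4, 3).astype('float32')      # direction parameter
g = np.ones((3,), dtype='float32')                # per-output scale parameter
w = g * v / np.linalg.norm(v, axis=0, keepdims=True)

# Each column of w now has norm equal to the corresponding entry of g.
print(np.linalg.norm(w, axis=0))                  # -> approximately g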
python/paddle/fluid/layers/control_flow.py

@@ -848,7 +848,7 @@ def create_array(dtype):
 @templatedoc()
-def less_than(x, y, force_cpu=None, cond=None, **ignored):
+def less_than(x, y, force_cpu=None, cond=None):
     """
     ${comment}
@@ -1800,7 +1800,7 @@ def reorder_lod_tensor_by_rank(x, rank_table):
     return out

-def is_empty(x, cond=None, **ignored):
+def is_empty(x, cond=None):
     """
     Test whether a Variable is empty.
python/paddle/fluid/layers/detection.py

@@ -51,6 +51,8 @@ __all__ = [
     'yolov3_loss',
     'box_clip',
     'multiclass_nms',
+    'distribute_fpn_proposals',
+    'box_decoder_and_assign',
 ]
@@ -2221,3 +2223,138 @@ def multiclass_nms(bboxes,
     output.stop_gradient = True
     return output

Added functions:

def distribute_fpn_proposals(fpn_rois,
                             min_level,
                             max_level,
                             refer_level,
                             refer_scale,
                             name=None):
    """
    In Feature Pyramid Networks (FPN) models, it is needed to distribute all
    proposals into different FPN level, with respect to scale of the proposals,
    the referring scale and the referring level. Besides, to restore the order
    of proposals, we return an array which indicates the original index of rois
    in current proposals. To compute FPN level for each roi, the formula is
    given as follows:

    .. math::

        roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}

        level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level)

    where BBoxArea is a function to compute the area of each roi.

    Args:
        fpn_rois(variable): The input fpn_rois, the second dimension is 4.
        min_level(int): The lowest level of FPN layer where the proposals come
            from.
        max_level(int): The highest level of FPN layer where the proposals
            come from.
        refer_level(int): The referring level of FPN layer with specified scale.
        refer_scale(int): The referring scale of FPN layer with specified level.
        name(str|None): The name of this operator.

    Returns:
        tuple:
            A tuple(multi_rois, restore_ind) is returned. The multi_rois is
            a list of segmented tensor variables. The restore_ind is a 2D
            Tensor with shape [N, 1], N is the number of total rois. It is
            used to restore the order of fpn_rois.

    Examples:
        .. code-block:: python

            fpn_rois = fluid.layers.data(
                name='data', shape=[4], dtype='float32', lod_level=1)
            multi_rois, restore_ind = fluid.layers.distribute_fpn_proposals(
                fpn_rois=fpn_rois,
                min_level=2,
                max_level=5,
                refer_level=4,
                refer_scale=224)
    """
    helper = LayerHelper('distribute_fpn_proposals', **locals())
    dtype = helper.input_dtype()
    num_lvl = max_level - min_level + 1
    multi_rois = [
        helper.create_variable_for_type_inference(dtype)
        for i in range(num_lvl)
    ]
    restore_ind = helper.create_variable_for_type_inference(dtype='int32')
    helper.append_op(
        type='distribute_fpn_proposals',
        inputs={'FpnRois': fpn_rois},
        outputs={'MultiFpnRois': multi_rois,
                 'RestoreIndex': restore_ind},
        attrs={
            'min_level': min_level,
            'max_level': max_level,
            'refer_level': refer_level,
            'refer_scale': refer_scale
        })
    return multi_rois, restore_ind


@templatedoc()
def box_decoder_and_assign(prior_box,
                           prior_box_var,
                           target_box,
                           box_score,
                           box_clip,
                           name=None):
    """
    ${comment}
    Args:
        prior_box(${prior_box_type}): ${prior_box_comment}
        prior_box_var(${prior_box_var_type}): ${prior_box_var_comment}
        target_box(${target_box_type}): ${target_box_comment}
        box_score(${box_score_type}): ${box_score_comment}
        box_clip(${box_clip_type}): ${box_clip_comment}
        name(str|None): The name of this operator
    Returns:
        decode_box(Variable), output_assign_box(Variable):
            two variables:
            - decode_box(${decode_box_type}): ${decode_box_comment}
            - output_assign_box(${output_assign_box_type}): ${output_assign_box_comment}
    Examples:
        .. code-block:: python

            pb = fluid.layers.data(
                name='prior_box', shape=[20, 4], dtype='float32')
            pbv = fluid.layers.data(
                name='prior_box_var', shape=[1, 4], dtype='float32')
            loc = fluid.layers.data(
                name='target_box', shape=[20, 4*81], dtype='float32')
            scores = fluid.layers.data(
                name='scores', shape=[20, 81], dtype='float32')
            decoded_box, output_assign_box = fluid.layers.box_decoder_and_assign(
                pb, pbv, loc, scores, 4.135)
    """
    helper = LayerHelper("box_decoder_and_assign", **locals())
    decoded_box = helper.create_variable_for_type_inference(
        dtype=prior_box.dtype)
    output_assign_box = helper.create_variable_for_type_inference(
        dtype=prior_box.dtype)

    helper.append_op(
        type="box_decoder_and_assign",
        inputs={
            "PriorBox": prior_box,
            "PriorBoxVar": prior_box_var,
            "TargetBox": target_box,
            "BoxScore": box_score
        },
        attrs={"box_clip": box_clip},
        outputs={
            "DecodeBox": decoded_box,
            "OutputAssignBox": output_assign_box
        })
    return decoded_box, output_assign_box
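For the level-assignment formula in the distribute_fpn_proposals docstring above, a small NumPy sketch may help; it mirrors the Python reference used in the new unit test further below (base-2 logarithm with a small epsilon), not the C++ operator itself, and the helper name is made up:

import numpy as np

def fpn_level_for_rois(rois, refer_level=4, refer_scale=224,
                       min_level=2, max_level=5):
    """level = floor(refer_level + log2(sqrt(area) / refer_scale)), clipped."""
    w = rois[:, 2] - rois[:, 0] + 1
    h = rois[:, 3] - rois[:, 1] + 1
    scale = np.sqrt(w * h)
    level = np.floor(refer_level + np.log2(scale / refer_scale + 1e-6))
    return np.clip(level, min_level, max_level)

rois = np.array([[0, 0, 111, 111],      # scale 112 -> level 3
                 [0, 0, 223, 223],      # scale 224 -> level 4
                 [0, 0, 447, 447]],     # scale 448 -> level 5
                dtype='float32')
print(fpn_level_for_rois(rois))          # [3. 4. 5.]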
python/paddle/fluid/layers/nn.py

@@ -4833,11 +4833,6 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
     """

     def __check_input(x, y):
-        if len(y.shape) > len(x.shape):
-            raise ValueError(
-                "Invalid inputs for matmul. "
-                "x's rank should be always greater than or equal to y'rank.")
         x_shape = list(x.shape)
         y_shape = list(y.shape)
         if len(x_shape) == 1:
@@ -4853,10 +4848,11 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
         if x_shape[-1] != y_shape[-2]:
             raise ValueError("Invalid inputs for matmul.")

-        if len(y_shape) > 2:
+        if len(y_shape) > 2 and len(x_shape) > 2:
             for i, dim_x in enumerate(x_shape[:-2]):
                 if dim_x != y_shape[i]:
-                    raise ValueError("Invalid inputs for matmul.")
+                    raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" %
+                                     (x.shape, y.shape))

     __check_input(x, y)
python/paddle/fluid/layers/tensor.py

@@ -142,7 +142,8 @@ def create_global_var(shape,
 def cast(x, dtype):
     """
     This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts
-    it to the output with :attr:`dtype`.
+    it to the output with :attr:`dtype`. It's meaningless if the output
+    dtype equals the input dtype, but it's fine if you do so.

     Args:
         x (Variable): The input Variable for casting.
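A minimal usage sketch of the behaviour the revised docstring describes (the variable name x is an assumption; the API is the fluid.layers.cast shown in the diff):

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[3], dtype='float32')
y = fluid.layers.cast(x=x, dtype='int64')     # float32 -> int64
z = fluid.layers.cast(x=x, dtype='float32')   # same dtype: allowed, effectively a copy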
python/paddle/fluid/optimizer.py

@@ -379,7 +379,7 @@ class Optimizer(object):
         self._dtype = loss.dtype
         program = loss.block.program
         optimize_ops = []
-        if imperative_base.enabled():
+        if framework._in_imperative_mode():
             if parameter_list is not None:
                 parameters = parameter_list
             else:
python/paddle/fluid/parallel_executor.py

@@ -106,13 +106,18 @@ class ParallelExecutor(object):
             else framework.default_main_program()
         self._compiled_program = compiler.CompiledProgram(main_program)
+        if share_vars_from:
+            assert isinstance(
+                share_vars_from,
+                ParallelExecutor), "The share_vars_from should be ParallelExecutor."
         self._compiled_program.with_data_parallel(
             loss_name=loss_name,
             build_strategy=build_strategy,
             exec_strategy=exec_strategy,
-            share_vars_from=share_vars_from)
+            share_vars_from=share_vars_from._compiled_program
+            if share_vars_from else None)
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
-        self._executor = executor.Executor(self._place)
+        self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)

     def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
@@ -180,11 +185,11 @@ class ParallelExecutor(object):
             loss = pe.run(feed=feeder.feed(cur_batch),
                           fetch_list=[avg_cost.name]))
         """
-        return self._executor.run(program=self._compiled_program,
+        return self._exe.run(program=self._compiled_program,
                              scope=self._scope,
                              feed=feed,
                              fetch_list=fetch_list,
                              return_numpy=return_numpy)

     @property
     def device_count(self):
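The new assertion guards the typical sharing pattern: a test-time executor reuses the parameters created by a training executor. A hedged sketch of that wiring, assuming hypothetical train_program, test_program and avg_cost variables built elsewhere:

import paddle.fluid as fluid

# Hypothetical programs/variables; only the share_vars_from wiring matters here.
train_exe = fluid.ParallelExecutor(
    use_cuda=True, loss_name=avg_cost.name, main_program=train_program)

# Must be a ParallelExecutor; after this change its CompiledProgram is
# forwarded internally, which is exactly what the new assert protects.
test_exe = fluid.ParallelExecutor(
    use_cuda=True, main_program=test_program, share_vars_from=train_exe)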
python/paddle/fluid/tests/test_detection.py

@@ -504,5 +504,21 @@ class TestMulticlassNMS(unittest.TestCase):
             self.assertIsNotNone(output)

Added test case:

class TestDistributeFpnProposals(unittest.TestCase):
    def test_distribute_fpn_proposals(self):
        program = Program()
        with program_guard(program):
            fpn_rois = fluid.layers.data(
                name='data', shape=[4], dtype='float32', lod_level=1)
            multi_rois, restore_ind = layers.distribute_fpn_proposals(
                fpn_rois=fpn_rois,
                min_level=2,
                max_level=5,
                refer_level=4,
                refer_scale=224)
            self.assertIsNotNone(multi_rois)
            self.assertIsNotNone(restore_ind)


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py

@@ -70,3 +70,17 @@ def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
             fetch_list=['x@GRAD', 'out'])
         __assert_close(x_grad, out[0], 'x@GRAD')

Added helper:

def format_reorder(out, size):
    in_n = size[0]
    out_h = size[2]
    out_w = size[3]
    out_c = size[1]
    out_tmp = np.zeros((in_n, out_h, out_w, out_c))
    for n in range(in_n):
        for i in range(out_h):
            for j in range(out_w):
                for m in range(out_c):
                    out_tmp[n, i, j, m] = out[n, m, i, j]
    return out_tmp.reshape(in_n, out_c, out_h, out_w)
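The loop above rewrites NCHW data into NHWC memory order and then re-labels it with the original NCHW shape. For reference, a vectorized equivalent (the function name is made up; it is not part of the commit):

import numpy as np

def format_reorder_vectorized(out, size):
    # Same result as the loop-based helper above: transpose to NHWC order,
    # then view the flat data again under the original NCHW shape.
    return out.transpose(0, 2, 3, 1).reshape(size)

x = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5).astype('float32')
print(format_reorder_vectorized(x, [2, 3, 4, 5]).shape)   # (2, 3, 4, 5)
# np.array_equal(format_reorder(x, [2, 3, 4, 5]),
#                format_reorder_vectorized(x, [2, 3, 4, 5]))  -> True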
python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py

@@ -20,6 +20,7 @@ import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
 from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp
+from mkldnn_op_test import format_reorder


 def conv2d_forward_refer(input, filter, group, conv_param):
@@ -29,20 +30,6 @@ def conv2d_forward_refer(input, filter, group, conv_param):
     return format_reorder(out, size)


-def format_reorder(out, size):
-    in_n = size[0]
-    out_h = size[2]
-    out_w = size[3]
-    out_c = size[1]
-    out_tmp = np.zeros((in_n, out_h, out_w, out_c))
-    for n in range(in_n):
-        for i in range(out_h):
-            for j in range(out_w):
-                for m in range(out_c):
-                    out_tmp[n, i, j, m] = out[n, m, i, j]
-    return out_tmp.reshape(in_n, out_c, out_h, out_w)
-
-
 class TestConv2dInt8Op(TestConv2dOp):
     def setUp(self):
         self.op_type = "conv2d"
python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py (new file)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest
from mkldnn_op_test import format_reorder


class TestReQuantizeOp(OpTest):
    def setUp(self):
        self.op_type = 'requantize'
        self.scale_in = 2.0
        self.scale_out = 1.5
        self.input_size = [1, 1, 5, 5]
        self.data_type = 'int8'
        self.set_scale()
        self.set_data_type()

        scale_shift = self.scale_out / self.scale_in

        if self.data_type == 'int8':
            input = (np.random.randint(0, 100, self.input_size) - 50).astype(
                self.data_type)
            output_tmp = np.round(input.astype('float32') *
                                  scale_shift).astype('int8')
        else:
            input = (np.random.randint(0, 100, self.input_size)).astype(
                self.data_type)
            output_tmp = np.round(input.astype('float32') *
                                  scale_shift).astype('uint8')

        output = format_reorder(output_tmp, self.input_size)

        self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(input)}
        self.outputs = {'Output': output}
        self.attrs = {'Scale_in': self.scale_in, 'Scale_out': self.scale_out}

    def test_check_output(self):
        self.check_output()

    def set_scale(self):
        pass

    def set_data_type(self):
        pass


# --------------------test requantize with s8 input--------------------


class TestReQuantizeOp1(TestReQuantizeOp):
    def set_scale(self):
        self.scale_in = 1.5
        self.scale_out = 1.5


class TestReQuantizeOp2(TestReQuantizeOp):
    def set_scale(self):
        self.scale_in = 0.1
        self.scale_out = 0.2


# --------------------test requantize with u8 input--------------------


class TestReQuantizeOp3(TestReQuantizeOp1):
    def set_data_type(self):
        self.data_type = 'uint8'


class TestReQuantizeOp4(TestReQuantizeOp2):
    def set_data_type(self):
        self.data_type = 'uint8'


if __name__ == '__main__':
    unittest.main()
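The arithmetic this test exercises is a plain rescaling between two quantization scales followed by rounding. A tiny NumPy illustration of the same computation (values chosen arbitrarily):

import numpy as np

# Requantization as used in the test above: map an int8 tensor quantized with
# scale_in onto scale_out, rounding to the nearest representable integer.
scale_in, scale_out = 2.0, 1.5
x_s8 = np.array([-50, -3, 0, 7, 49], dtype=np.int8)
y_s8 = np.round(x_s8.astype('float32') * (scale_out / scale_in)).astype(np.int8)
print(y_s8)   # [-38  -2   0   5  37]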
python/paddle/fluid/tests/unittests/test_base_layer.py

@@ -16,27 +16,17 @@ import unittest
 import numpy as np
 import paddle.fluid as fluid
-from paddle.fluid.layer_helper import LayerHelper


 class L1(fluid.imperative.Layer):
     def __init__(self, prefix):
         super(L1, self).__init__(prefix)
-        self._helper = LayerHelper(
-            self.full_name(),
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
-
-        self.w1 = self._helper.create_parameter(
-            attr=self._helper.param_attr,
-            shape=[2, 2],
-            dtype='float32',
-            is_bias=False)
-        self.w2 = self._helper.create_parameter(
-            attr=self._helper.param_attr,
-            shape=[2, 2],
-            dtype='float32',
-            is_bias=False)
+        self._param_attr = fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.1))
+        self.w1 = self.create_parameter(
+            attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False)
+        self.w2 = self.create_parameter(
+            attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False)

     def forward(self):
         return self.w1 + self.w2
@@ -67,8 +57,8 @@ class TestBaseLayer(unittest.TestCase):
         with fluid.imperative.guard():
             l = L1('test_one_level')
             ret = l()
-            self.assertEqual(l.w1.name, "test_one_level/L1_0_0.w_0")
-            self.assertEqual(l.w2.name, "test_one_level/L1_0_0.w_1")
+            self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0")
+            self.assertEqual(l.w2.name, "test_one_level/L1_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))

     def test_three_level(self):
@@ -76,12 +66,12 @@ class TestBaseLayer(unittest.TestCase):
             l = L3('test_three_level')
             names = [p.name for p in l.parameters()]
             ret = l()
-            self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0_0.w_0")
-            self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0_0.w_1")
-            self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1_0.w_0")
-            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1_0.w_1")
-            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0_0.w_0")
-            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0_0.w_1")
+            self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0.w_0")
+            self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0.w_1")
+            self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1.w_0")
+            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1.w_1")
+            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0.w_0")
+            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
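Condensed, the new idiom shown in this test reads roughly as follows (a sketch closely modelled on the L1 layer above; the class and prefix names are made up):

import paddle.fluid as fluid

# Parameters are now created via Layer.create_parameter rather than through a
# hand-built LayerHelper, and names no longer carry the extra "_0" suffix.
class TwoWeights(fluid.imperative.Layer):
    def __init__(self, prefix):
        super(TwoWeights, self).__init__(prefix)
        attr = fluid.ParamAttr(
            initializer=fluid.initializer.Constant(value=0.1))
        self.w1 = self.create_parameter(
            attr=attr, shape=[2, 2], dtype='float32', is_bias=False)
        self.w2 = self.create_parameter(
            attr=attr, shape=[2, 2], dtype='float32', is_bias=False)

    def forward(self):
        return self.w1 + self.w2

with fluid.imperative.guard():
    layer = TwoWeights('two_weights')
    out = layer()
    print(out._numpy())   # 0.2 everywhere; parameter names end in .w_0 / .w_1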
python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py (new file)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
import sys
import math
from op_test import OpTest


def box_decoder_and_assign(deltas, weights, boxes, box_score, box_clip):
    boxes = boxes.astype(deltas.dtype, copy=False)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    wx, wy, ww, wh = weights
    dx = deltas[:, 0::4] * wx
    dy = deltas[:, 1::4] * wy
    dw = deltas[:, 2::4] * ww
    dh = deltas[:, 3::4] * wh
    # Prevent sending too large values into np.exp()
    dw = np.minimum(dw, box_clip)
    dh = np.minimum(dh, box_clip)

    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]

    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
    # x2 (note: "- 1" is correct; don't be fooled by the asymmetry)
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1
    # y2 (note: "- 1" is correct; don't be fooled by the asymmetry)
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1

    output_assign_box = []
    for ino in range(len(pred_boxes)):
        rank = np.argsort(-box_score[ino])
        maxidx = rank[0]
        if maxidx == 0:
            maxidx = rank[1]
        beg_pos = maxidx * 4
        end_pos = maxidx * 4 + 4
        output_assign_box.append(pred_boxes[ino, beg_pos:end_pos])
    output_assign_box = np.array(output_assign_box)

    return pred_boxes, output_assign_box


class TestBoxDecoderAndAssignOpWithLoD(OpTest):
    def test_check_output(self):
        self.check_output()

    def setUp(self):
        self.op_type = "box_decoder_and_assign"
        lod = [[4, 8, 8]]
        num_classes = 10
        prior_box = np.random.random((20, 4)).astype('float32')
        prior_box_var = np.array([0.1, 0.1, 0.2, 0.2], dtype=np.float32)
        target_box = np.random.random((20, 4 * num_classes)).astype('float32')
        box_score = np.random.random((20, num_classes)).astype('float32')
        box_clip = 4.135
        output_box, output_assign_box = box_decoder_and_assign(
            target_box, prior_box_var, prior_box, box_score, box_clip)

        self.inputs = {
            'PriorBox': (prior_box, lod),
            'PriorBoxVar': prior_box_var,
            'TargetBox': (target_box, lod),
            'BoxScore': (box_score, lod),
        }
        self.attrs = {'box_clip': box_clip}
        self.outputs = {
            'DecodeBox': output_box,
            'OutputAssignBox': output_assign_box
        }


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py (new file)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import unittest
from test_dist_base import TestDistBase


class TestDistMnistNCCL2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
        self._use_reduce = False
        self._use_reader_alloc = False
        self._nccl2_mode = True

    def test_dist_train(self):
        import paddle.fluid as fluid
        if fluid.core.is_compiled_with_cuda():
            self.check_with_place(
                "dist_mnist.py",
                delta=1,
                need_envs={
                    "FLAGS_enable_parallel_graph": "1",
                    "FLAGS_sync_nccl_allreduce": "1"
                })


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py (new file)

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
import math
import sys
from op_test import OpTest


class TestDistributeFPNProposalsOp(OpTest):
    def set_data(self):
        self.init_test_case()
        self.make_rois()
        self.rois_fpn, self.rois_idx_restore = self.calc_rois_distribute()
        self.inputs = {'FpnRois': (self.rois[:, 1:5], self.rois_lod)}
        self.attrs = {
            'max_level': self.roi_max_level,
            'min_level': self.roi_min_level,
            'refer_scale': self.canonical_scale,
            'refer_level': self.canonical_level
        }
        output = [('out%d' % i, self.rois_fpn[i])
                  for i in range(len(self.rois_fpn))]
        self.outputs = {
            'MultiFpnRois': output,
            'RestoreIndex': self.rois_idx_restore
        }

    def init_test_case(self):
        self.roi_max_level = 5
        self.roi_min_level = 2
        self.canonical_scale = 224
        self.canonical_level = 4
        self.images_shape = [512, 512]

    def boxes_area(self, boxes):
        w = (boxes[:, 2] - boxes[:, 0] + 1)
        h = (boxes[:, 3] - boxes[:, 1] + 1)
        areas = w * h
        assert np.all(areas >= 0), 'Negative areas founds'
        return areas

    def map_rois_to_fpn_levels(self, rois, lvl_min, lvl_max):
        s = np.sqrt(self.boxes_area(rois))
        s0 = self.canonical_scale
        lvl0 = self.canonical_level
        target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
        target_lvls = np.clip(target_lvls, lvl_min, lvl_max)
        return target_lvls

    def get_sub_lod(self, sub_lvl):
        sub_lod = []
        max_batch_id = sub_lvl[-1]
        for i in range(max_batch_id.astype(np.int32) + 1):
            sub_lod.append(np.where(sub_lvl == i)[0].size)
        return sub_lod

    def add_multilevel_roi(self, rois, target_lvls, lvl_min, lvl_max):
        rois_idx_order = np.empty((0, ))
        rois_fpn = []
        for lvl in range(lvl_min, lvl_max + 1):
            idx_lvl = np.where(target_lvls == lvl)[0]
            if len(idx_lvl) == 0:
                rois_fpn.append((np.empty(shape=(0, 4)), [[0, 0]]))
                continue
            sub_lod = self.get_sub_lod(rois[idx_lvl, 0])
            rois_fpn.append((rois[idx_lvl, 1:], [sub_lod]))
            rois_idx_order = np.concatenate((rois_idx_order, idx_lvl))
        rois_idx_restore = np.argsort(rois_idx_order).astype(
            np.int32, copy=False)
        return rois_fpn, rois_idx_restore

    def calc_rois_distribute(self):
        lvl_min = self.roi_min_level
        lvl_max = self.roi_max_level
        target_lvls = self.map_rois_to_fpn_levels(self.rois[:, 1:5], lvl_min,
                                                  lvl_max)
        rois_fpn, rois_idx_restore = self.add_multilevel_roi(
            self.rois, target_lvls, lvl_min, lvl_max)
        return rois_fpn, rois_idx_restore

    def make_rois(self):
        self.rois_lod = [[100, 200]]
        rois = []
        lod = self.rois_lod[0]
        bno = 0
        for roi_num in lod:
            for i in range(roi_num):
                xywh = np.random.rand(4)
                xy1 = xywh[0:2] * 20
                wh = xywh[2:4] * (self.images_shape - xy1)
                xy2 = xy1 + wh
                roi = [bno, xy1[0], xy1[1], xy2[0], xy2[1]]
                rois.append(roi)
            bno += 1
        self.rois = np.array(rois).astype("float32")

    def setUp(self):
        self.op_type = "distribute_fpn_proposals"
        self.set_data()

    def test_check_output(self):
        self.check_output()
python/paddle/fluid/tests/unittests/test_imperative_basic.py

@@ -53,11 +53,15 @@ class MLP(fluid.imperative.Layer):
         super(MLP, self).__init__(name_scope)
         self._fc1 = FC(self.full_name(),
                        3,
-                       fluid.ParamAttr(
-                           initializer=fluid.initializer.Constant(value=0.1)))
+                       param_attr=fluid.ParamAttr(
+                           initializer=fluid.initializer.Constant(value=0.1)),
+                       bias_attr=fluid.ParamAttr(
+                           initializer=fluid.initializer.Constant(value=0.1)))
         self._fc2 = FC(self.full_name(),
                        4,
-                       fluid.ParamAttr(
-                           initializer=fluid.initializer.Constant(value=0.1)))
+                       param_attr=fluid.ParamAttr(
+                           initializer=fluid.initializer.Constant(value=0.1)),
+                       bias_attr=fluid.ParamAttr(
+                           initializer=fluid.initializer.Constant(value=0.1)))

     def forward(self, inputs):
@@ -74,41 +78,37 @@ class SimpleRNNCell(fluid.imperative.Layer):
         self.step_input_size = step_input_size
         self.hidden_size = hidden_size
         self.output_size = output_size
-        self._dype = core.VarDesc.VarType.FP32
-        from paddle.fluid.layer_helper import LayerHelper
-        self._helper = LayerHelper(
-            'SimpleRNNCell', act="tanh", param_attr=param_attr)
+        self._dtype = core.VarDesc.VarType.FP32
+        self.param_attr = param_attr

     def _build_once(self, inputs, pre_hidden):
         i2h_param_shape = [self.step_input_size, self.hidden_size]
         h2h_param_shape = [self.hidden_size, self.hidden_size]
         h2o_param_shape = [self.output_size, self.hidden_size]
-        self._i2h_w = self._helper.create_parameter(
-            attr=self._helper.param_attr,
+        self._i2h_w = self.create_parameter(
+            attr=self.param_attr,
             shape=i2h_param_shape,
             dtype=self._dtype,
             is_bias=False)
-        self._h2h_w = self._helper.create_parameter(
-            attr=self._helper.param_attr,
+        self._h2h_w = self.create_parameter(
+            attr=self.param_attr,
             shape=h2h_param_shape,
             dtype=self._dtype,
             is_bias=False)
-        self._h2o_w = self._helper.create_parameter(
-            attr=self._helper.param_attr,
+        self._h2o_w = self.create_parameter(
+            attr=self.param_attr,
             shape=h2o_param_shape,
             dtype=self._dtype,
             is_bias=False)

     def forward(self, input, pre_hidden):
-        tmp_i2h = self._helper.create_variable_for_type_inference(self._dtype)
-        tmp_h2h = self._helper.create_variable_for_type_inference(self._dtype)
-        hidden = self._helper.create_variable_for_type_inference(self._dype)
-        out = self._helper.create_variable_for_type_inference(self._dype)
-        softmax_out = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        reduce_out = self._helper.create_variable_for_type_inference(
-            self._dtype)
+        tmp_i2h = self.create_variable(dtype=self._dtype)
+        tmp_h2h = self.create_variable(dtype=self._dtype)
+        hidden = self.create_variable(dtype=self._dtype)
+        out = self.create_variable(dtype=self._dtype)
+        softmax_out = self.create_variable(dtype=self._dtype)
+        reduce_out = self.create_variable(dtype=self._dtype)
         self._helper.append_op(
             type="mul",
             inputs={"X": input,
@@ -132,7 +132,7 @@ class SimpleRNNCell(fluid.imperative.Layer):
             outputs={'Out': hidden},
             attrs={'axis': -1,
                    'use_mkldnn': False})
-        hidden = self._helper.append_activation(hidden)
+        hidden = self._helper.append_activation(hidden, act='tanh')

         self._helper.append_op(
             type="mul",
@@ -174,7 +174,7 @@ class SimpleRNN(fluid.imperative.Layer):
         outs = list()
         pre_hiddens = list()

-        init_hidden = fluid.layers.tensor.create_parameter(
+        init_hidden = self.create_parameter(
             attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.1)),
             shape=[1, 3],
@@ -337,10 +337,10 @@ class TestImperative(unittest.TestCase):
         self.assertTrue(np.allclose(dy_grad, static_grad))

         params = mlp.parameters(True)
-        self.assertEqual("mlp/MLP_0/FC_0_0.w_0", params[0].name)
-        self.assertEqual("mlp/MLP_0/FC_0_0.b_0", params[1].name)
-        self.assertEqual("mlp/MLP_0/FC_1_0.w_0", params[2].name)
-        self.assertEqual("mlp/MLP_0/FC_1_0.b_0", params[3].name)
+        self.assertEqual("mlp/MLP_0/FC_0.w_0", params[0].name)
+        self.assertEqual("mlp/MLP_0/FC_0.b_0", params[1].name)
+        self.assertEqual("mlp/MLP_0/FC_1.w_0", params[2].name)
+        self.assertEqual("mlp/MLP_0/FC_1.b_0", params[3].name)
         self.assertEqual(len(params), 4)

         sublayers = mlp.sublayers(True)
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py

@@ -78,7 +78,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):

 class MNIST(fluid.imperative.Layer):
-    def __init__(self, name_scope, param_attr=None, bias_attr=None):
+    def __init__(self, name_scope):
         super(MNIST, self).__init__(name_scope)

         self._simple_img_conv_pool_1 = SimpleImgConvPool(
python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py

@@ -41,19 +41,17 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
         self._dropout = dropout
         self._input = None
         self._num_steps = num_steps
-        from paddle.fluid.layer_helper import LayerHelper
-        self._helper = LayerHelper('SimpleLSTMRNN', act="tanh")
+        self.cell_array = []
+        self.hidden_array = []

     def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
         self.weight_1_arr = []
         self.weight_2_arr = []
         self.bias_arr = []
-        self.hidden_array = []
-        self.cell_array = []
         self.mask_array = []

         for i in range(self._num_layers):
-            weight_1 = self._helper.create_parameter(
+            weight_1 = self.create_parameter(
                 attr=fluid.ParamAttr(
                     initializer=fluid.initializer.UniformInitializer(
                         low=-self._init_scale, high=self._init_scale)),
@@ -62,7 +60,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 default_initializer=fluid.initializer.UniformInitializer(
                     low=-self._init_scale, high=self._init_scale))
             self.weight_1_arr.append(weight_1)
-            bias_1 = self._helper.create_parameter(
+            bias_1 = self.create_parameter(
                 attr=fluid.ParamAttr(
                     initializer=fluid.initializer.UniformInitializer(
                         low=-self._init_scale, high=self._init_scale)),
@@ -71,6 +69,11 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 default_initializer=fluid.initializer.Constant(0.0))
             self.bias_arr.append(bias_1)

+    def forward(self, input_embedding, init_hidden=None, init_cell=None):
+        self.cell_array = []
+        self.hidden_array = []
+
+        for i in range(self._num_layers):
             pre_hidden = fluid.layers.slice(
                 init_hidden, axes=[0], starts=[i], ends=[i + 1])
             pre_cell = fluid.layers.slice(
@@ -82,7 +85,6 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
             self.hidden_array.append(pre_hidden)
             self.cell_array.append(pre_cell)

-    def forward(self, input_embedding, init_hidden=None, init_cell=None):
         res = []
         for index in range(self._num_steps):
             self._input = fluid.layers.slice(
@@ -145,8 +147,6 @@ class PtbModel(fluid.imperative.Layer):
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
-        from paddle.fluid.layer_helper import LayerHelper
-        self._helper = LayerHelper('PtbModel', act="tanh")
         self.simple_lstm_rnn = SimpleLSTMRNN(
             self.full_name(),
             hidden_size,
@@ -163,13 +163,13 @@ class PtbModel(fluid.imperative.Layer):
                 name='embedding_para',
                 initializer=fluid.initializer.UniformInitializer(
                     low=-init_scale, high=init_scale)))
-        self.softmax_weight = self._helper.create_parameter(
+        self.softmax_weight = self.create_parameter(
             attr=fluid.ParamAttr(),
             shape=[self.hidden_size, self.vocab_size],
             dtype="float32",
             default_initializer=fluid.initializer.UniformInitializer(
                 low=-self.init_scale, high=self.init_scale))
-        self.softmax_bias = self._helper.create_parameter(
+        self.softmax_bias = self.create_parameter(
             attr=fluid.ParamAttr(),
             shape=[self.vocab_size],
             dtype="float32",
@@ -180,7 +180,6 @@ class PtbModel(fluid.imperative.Layer):
         pass

     def forward(self, input, label, init_hidden, init_cell):
         init_h = fluid.layers.reshape(
             init_hidden, shape=[self.num_layers, -1, self.hidden_size])