Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
b98b7440
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b98b7440
编写于
8月 29, 2018
作者:
C
Chen Weihang
提交者:
GitHub
8月 29, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into sequence_enumerate_op
上级
0c4697f8
902f19b4
变更
145
显示空白变更内容
内联
并排
Showing
145 changed file
with
8109 addition
and
1325 deletion
+8109
-1325
cmake/external/anakin.cmake
cmake/external/anakin.cmake
+2
-3
paddle/fluid/API.spec
paddle/fluid/API.spec
+5
-1
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+2
-2
paddle/fluid/framework/data_type.cc
paddle/fluid/framework/data_type.cc
+1
-0
paddle/fluid/framework/data_type.h
paddle/fluid/framework/data_type.h
+3
-0
paddle/fluid/framework/details/multi_devices_graph_pass.cc
paddle/fluid/framework/details/multi_devices_graph_pass.cc
+41
-24
paddle/fluid/framework/framework.proto
paddle/fluid/framework/framework.proto
+1
-0
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+9
-5
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+273
-0
paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
+14
-7
paddle/fluid/framework/ir/fc_fuse_pass.cc
paddle/fluid/framework/ir/fc_fuse_pass.cc
+6
-8
paddle/fluid/framework/ir/fc_fuse_pass.h
paddle/fluid/framework/ir/fc_fuse_pass.h
+1
-1
paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+126
-0
paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+33
-0
paddle/fluid/framework/ir/fuse_pass_base.h
paddle/fluid/framework/ir/fuse_pass_base.h
+44
-0
paddle/fluid/framework/ir/graph.cc
paddle/fluid/framework/ir/graph.cc
+0
-57
paddle/fluid/framework/ir/graph.h
paddle/fluid/framework/ir/graph.h
+15
-4
paddle/fluid/framework/ir/graph_helper.cc
paddle/fluid/framework/ir/graph_helper.cc
+1
-1
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+61
-16
paddle/fluid/framework/ir/graph_pattern_detector.h
paddle/fluid/framework/ir/graph_pattern_detector.h
+84
-15
paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+5
-5
paddle/fluid/framework/ir/graph_to_program_pass.cc
paddle/fluid/framework/ir/graph_to_program_pass.cc
+65
-0
paddle/fluid/framework/ir/graph_to_program_pass.h
paddle/fluid/framework/ir/graph_to_program_pass.h
+30
-0
paddle/fluid/framework/ir/graph_to_program_pass_test.cc
paddle/fluid/framework/ir/graph_to_program_pass_test.cc
+110
-0
paddle/fluid/framework/ir/graph_viz_pass.cc
paddle/fluid/framework/ir/graph_viz_pass.cc
+54
-28
paddle/fluid/framework/ir/graph_viz_pass.h
paddle/fluid/framework/ir/graph_viz_pass.h
+9
-0
paddle/fluid/framework/ir/node.h
paddle/fluid/framework/ir/node.h
+15
-6
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+256
-0
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
+33
-0
paddle/fluid/framework/op_desc.cc
paddle/fluid/framework/op_desc.cc
+9
-2
paddle/fluid/framework/op_desc.h
paddle/fluid/framework/op_desc.h
+1
-5
paddle/fluid/framework/program_desc.cc
paddle/fluid/framework/program_desc.cc
+20
-2
paddle/fluid/framework/program_desc.h
paddle/fluid/framework/program_desc.h
+2
-0
paddle/fluid/framework/tensor.cc
paddle/fluid/framework/tensor.cc
+5
-1
paddle/fluid/framework/var_type.h
paddle/fluid/framework/var_type.h
+1
-1
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+1
-1
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+16
-8
paddle/fluid/inference/analysis/analyzer.cc
paddle/fluid/inference/analysis/analyzer.cc
+14
-1
paddle/fluid/inference/analysis/analyzer_tester.cc
paddle/fluid/inference/analysis/analyzer_tester.cc
+24
-47
paddle/fluid/inference/analysis/argument.h
paddle/fluid/inference/analysis/argument.h
+41
-0
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
...fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+5
-0
paddle/fluid/inference/analysis/dot.h
paddle/fluid/inference/analysis/dot.h
+12
-6
paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
+44
-0
paddle/fluid/inference/analysis/fluid_to_ir_pass.h
paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+42
-12
paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
+6
-1
paddle/fluid/inference/analysis/ir_pass_manager.cc
paddle/fluid/inference/analysis/ir_pass_manager.cc
+8
-4
paddle/fluid/inference/analysis/ir_pass_manager.h
paddle/fluid/inference/analysis/ir_pass_manager.h
+5
-3
paddle/fluid/inference/analysis/pass_manager.cc
paddle/fluid/inference/analysis/pass_manager.cc
+2
-2
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+3
-2
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+165
-0
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+2
-0
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
...luid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+1
-0
paddle/fluid/inference/api/helper.cc
paddle/fluid/inference/api/helper.cc
+44
-0
paddle/fluid/inference/api/helper.h
paddle/fluid/inference/api/helper.h
+9
-20
paddle/fluid/inference/api/paddle_inference_api.h
paddle/fluid/inference/api/paddle_inference_api.h
+1
-0
paddle/fluid/inference/io.cc
paddle/fluid/inference/io.cc
+16
-0
paddle/fluid/inference/io.h
paddle/fluid/inference/io.h
+5
-0
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+4
-4
paddle/fluid/inference/tensorrt/convert/concat_op.cc
paddle/fluid/inference/tensorrt/convert/concat_op.cc
+57
-0
paddle/fluid/inference/tensorrt/convert/op_converter.h
paddle/fluid/inference/tensorrt/convert/op_converter.h
+8
-0
paddle/fluid/inference/tensorrt/convert/test_concat_op.cc
paddle/fluid/inference/tensorrt/convert/test_concat_op.cc
+49
-0
paddle/fluid/inference/tests/test_helper.h
paddle/fluid/inference/tests/test_helper.h
+14
-0
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+2
-0
paddle/fluid/operators/attention_lstm_op.cc
paddle/fluid/operators/attention_lstm_op.cc
+1
-1
paddle/fluid/operators/auc_op.h
paddle/fluid/operators/auc_op.h
+0
-14
paddle/fluid/operators/batch_norm_mkldnn_op.cc
paddle/fluid/operators/batch_norm_mkldnn_op.cc
+277
-119
paddle/fluid/operators/detection/CMakeLists.txt
paddle/fluid/operators/detection/CMakeLists.txt
+2
-2
paddle/fluid/operators/detection/generate_proposals_op.cc
paddle/fluid/operators/detection/generate_proposals_op.cc
+485
-0
paddle/fluid/operators/fake_dequantize_op.cc
paddle/fluid/operators/fake_dequantize_op.cc
+25
-12
paddle/fluid/operators/fake_dequantize_op.cu
paddle/fluid/operators/fake_dequantize_op.cu
+36
-0
paddle/fluid/operators/fake_dequantize_op.h
paddle/fluid/operators/fake_dequantize_op.h
+15
-8
paddle/fluid/operators/fetch_barrier_op.cc
paddle/fluid/operators/fetch_barrier_op.cc
+2
-0
paddle/fluid/operators/fusion_gru_op.cc
paddle/fluid/operators/fusion_gru_op.cc
+332
-0
paddle/fluid/operators/fusion_gru_op.h
paddle/fluid/operators/fusion_gru_op.h
+41
-0
paddle/fluid/operators/fusion_lstm_op.cc
paddle/fluid/operators/fusion_lstm_op.cc
+149
-7
paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
+206
-0
paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h
paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h
+42
-0
paddle/fluid/operators/math/concat.cu
paddle/fluid/operators/math/concat.cu
+6
-0
paddle/fluid/operators/math/cpu_vec.h
paddle/fluid/operators/math/cpu_vec.h
+1
-0
paddle/fluid/operators/math/math_function.cc
paddle/fluid/operators/math/math_function.cc
+2
-1
paddle/fluid/operators/math/math_function.cu
paddle/fluid/operators/math/math_function.cu
+5
-4
paddle/fluid/operators/math/padding.h
paddle/fluid/operators/math/padding.h
+124
-0
paddle/fluid/operators/math/sequence2batch.cc
paddle/fluid/operators/math/sequence2batch.cc
+8
-7
paddle/fluid/operators/math/sequence_padding.cc
paddle/fluid/operators/math/sequence_padding.cc
+97
-103
paddle/fluid/operators/math/sequence_padding.cu
paddle/fluid/operators/math/sequence_padding.cu
+103
-144
paddle/fluid/operators/math/sequence_padding.h
paddle/fluid/operators/math/sequence_padding.h
+38
-14
paddle/fluid/operators/math/sequence_padding_test.cc
paddle/fluid/operators/math/sequence_padding_test.cc
+19
-4
paddle/fluid/operators/pad_constant_like_op.cc
paddle/fluid/operators/pad_constant_like_op.cc
+212
-0
paddle/fluid/operators/pad_constant_like_op.cu
paddle/fluid/operators/pad_constant_like_op.cu
+27
-0
paddle/fluid/operators/pad_constant_like_op.h
paddle/fluid/operators/pad_constant_like_op.h
+93
-0
paddle/fluid/operators/pad_op.h
paddle/fluid/operators/pad_op.h
+20
-93
paddle/fluid/operators/parallel_do_op.cc
paddle/fluid/operators/parallel_do_op.cc
+1
-0
paddle/fluid/operators/print_op.cc
paddle/fluid/operators/print_op.cc
+33
-71
paddle/fluid/operators/scale_op.cc
paddle/fluid/operators/scale_op.cc
+20
-1
paddle/fluid/operators/scale_op.h
paddle/fluid/operators/scale_op.h
+20
-8
paddle/fluid/operators/send_barrier_op.cc
paddle/fluid/operators/send_barrier_op.cc
+4
-0
paddle/fluid/operators/sequence_pad_op.cc
paddle/fluid/operators/sequence_pad_op.cc
+194
-0
paddle/fluid/operators/sequence_pad_op.cu
paddle/fluid/operators/sequence_pad_op.cu
+29
-0
paddle/fluid/operators/sequence_pad_op.h
paddle/fluid/operators/sequence_pad_op.h
+66
-0
paddle/fluid/operators/unstack_op.cc
paddle/fluid/operators/unstack_op.cc
+26
-0
paddle/fluid/operators/unstack_op.h
paddle/fluid/operators/unstack_op.h
+135
-0
paddle/fluid/operators/warpctc_op.h
paddle/fluid/operators/warpctc_op.h
+18
-6
paddle/fluid/platform/CMakeLists.txt
paddle/fluid/platform/CMakeLists.txt
+5
-0
paddle/fluid/platform/cpu_info.cc
paddle/fluid/platform/cpu_info.cc
+17
-4
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+9
-1
paddle/fluid/platform/dynload/CMakeLists.txt
paddle/fluid/platform/dynload/CMakeLists.txt
+2
-0
paddle/fluid/platform/dynload/dynamic_loader.cc
paddle/fluid/platform/dynload/dynamic_loader.cc
+1
-2
paddle/fluid/platform/enforce.h
paddle/fluid/platform/enforce.h
+27
-3
paddle/fluid/platform/init.cc
paddle/fluid/platform/init.cc
+0
-6
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+10
-0
paddle/fluid/pybind/protobuf.cc
paddle/fluid/pybind/protobuf.cc
+1
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+3
-0
paddle/fluid/pybind/tensor_py.h
paddle/fluid/pybind/tensor_py.h
+1
-1
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+11
-4
python/paddle/dataset/image.py
python/paddle/dataset/image.py
+1
-2
python/paddle/dataset/movielens.py
python/paddle/dataset/movielens.py
+3
-2
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+2
-0
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+1
-4
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+71
-0
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+9
-2
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+191
-22
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+7
-1
python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
...ry_optimization/test_memopt_image_classification_train.py
+2
-2
python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
...ok_memory_optimization/test_memopt_machine_translation.py
+2
-2
python/paddle/fluid/tests/test_detection.py
python/paddle/fluid/tests/test_detection.py
+39
-0
python/paddle/fluid/tests/unittests/dist_se_resnext.py
python/paddle/fluid/tests/unittests/dist_se_resnext.py
+14
-8
python/paddle/fluid/tests/unittests/dist_transformer.py
python/paddle/fluid/tests/unittests/dist_transformer.py
+1641
-168
python/paddle/fluid/tests/unittests/dist_word2vec.py
python/paddle/fluid/tests/unittests/dist_word2vec.py
+10
-6
python/paddle/fluid/tests/unittests/test_dist_train.py
python/paddle/fluid/tests/unittests/test_dist_train.py
+1
-1
python/paddle/fluid/tests/unittests/test_dist_transformer.py
python/paddle/fluid/tests/unittests/test_dist_transformer.py
+42
-4
python/paddle/fluid/tests/unittests/test_dist_word2vec.py
python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+1
-1
python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
...n/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+21
-12
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+133
-0
python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+36
-26
python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py
...uid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py
+139
-0
python/paddle/fluid/tests/unittests/test_generate_proposals.py
...n/paddle/fluid/tests/unittests/test_generate_proposals.py
+320
-0
python/paddle/fluid/tests/unittests/test_gru_op.py
python/paddle/fluid/tests/unittests/test_gru_op.py
+104
-103
python/paddle/fluid/tests/unittests/test_pad_constant_like.py
...on/paddle/fluid/tests/unittests/test_pad_constant_like.py
+69
-0
python/paddle/fluid/tests/unittests/test_print_op.py
python/paddle/fluid/tests/unittests/test_print_op.py
+2
-3
python/paddle/fluid/tests/unittests/test_scale_op.py
python/paddle/fluid/tests/unittests/test_scale_op.py
+54
-0
python/paddle/fluid/tests/unittests/test_sequence_pad_op.py
python/paddle/fluid/tests/unittests/test_sequence_pad_op.py
+131
-0
python/paddle/fluid/tests/unittests/test_tensor.py
python/paddle/fluid/tests/unittests/test_tensor.py
+21
-0
python/paddle/fluid/tests/unittests/test_unstack_op.py
python/paddle/fluid/tests/unittests/test_unstack_op.py
+81
-0
python/paddle/fluid/tests/unittests/test_variable.py
python/paddle/fluid/tests/unittests/test_variable.py
+2
-1
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+21
-10
未找到文件。
cmake/external/anakin.cmake
浏览文件 @
b98b7440
...
@@ -52,9 +52,8 @@ ExternalProject_Add(
...
@@ -52,9 +52,8 @@ ExternalProject_Add(
extern_anakin
extern_anakin
${
EXTERNAL_PROJECT_LOG_ARGS
}
${
EXTERNAL_PROJECT_LOG_ARGS
}
DEPENDS
${
MKLML_PROJECT
}
DEPENDS
${
MKLML_PROJECT
}
# Anakin codes error on Intel(R) Xeon(R) Gold 5117 CPU, temporary do not compile avx512 related code.
GIT_REPOSITORY
"https://github.com/PaddlePaddle/Anakin"
GIT_REPOSITORY
"https://github.com/luotao1/Anakin"
GIT_TAG
"9424277cf9ae180a14aff09560d3cd60a49c76d2"
GIT_TAG
"211d1fc5d813d70c0c14072f9083cf25f40940ea"
PREFIX
${
ANAKIN_SOURCE_DIR
}
PREFIX
${
ANAKIN_SOURCE_DIR
}
UPDATE_COMMAND
""
UPDATE_COMMAND
""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
CMAKE_ARGS -DUSE_GPU_PLACE=YES
...
...
paddle/fluid/API.spec
浏览文件 @
b98b7440
...
@@ -113,6 +113,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size
...
@@ -113,6 +113,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
...
@@ -146,6 +147,7 @@ paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', '
...
@@ -146,6 +147,7 @@ paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', '
paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None))
paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None))
paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
...
@@ -165,6 +167,7 @@ paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, ke
...
@@ -165,6 +167,7 @@ paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, ke
paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
...
@@ -297,6 +300,7 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
...
@@ -297,6 +300,7 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
...
@@ -379,7 +383,7 @@ paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, a
...
@@ -379,7 +383,7 @@ paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, a
paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool
paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool
paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[
float32], arg1: paddle::platform::CUDAPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[u
int8], arg1: paddle::platform::CUDAPinnedPlace) -> None
paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[
int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[
int8], arg1: paddle::platform::CUDAPinnedPlace) -> None
paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
b98b7440
...
@@ -107,11 +107,11 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
...
@@ -107,11 +107,11 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library
(
feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog
)
cc_library
(
feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog
)
if
(
WITH_DISTRIBUTE
)
if
(
WITH_DISTRIBUTE
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr
graph_to_program_pass
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set_source_files_properties
(
executor.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
set_source_files_properties
(
executor.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
else
()
else
()
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method
graph_to_program_pass
)
endif
()
endif
()
if
(
NOT WIN32
)
if
(
NOT WIN32
)
...
...
paddle/fluid/framework/data_type.cc
浏览文件 @
b98b7440
...
@@ -64,6 +64,7 @@ static DataTypeMap* InitDataTypeMap() {
...
@@ -64,6 +64,7 @@ static DataTypeMap* InitDataTypeMap() {
RegType
(
size_t
,
proto
::
VarType
::
SIZE_T
);
RegType
(
size_t
,
proto
::
VarType
::
SIZE_T
);
RegType
(
int16_t
,
proto
::
VarType
::
INT16
);
RegType
(
int16_t
,
proto
::
VarType
::
INT16
);
RegType
(
uint8_t
,
proto
::
VarType
::
UINT8
);
RegType
(
uint8_t
,
proto
::
VarType
::
UINT8
);
RegType
(
int8_t
,
proto
::
VarType
::
INT8
);
#undef RegType
#undef RegType
return
retv
;
return
retv
;
...
...
paddle/fluid/framework/data_type.h
浏览文件 @
b98b7440
...
@@ -54,6 +54,9 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
...
@@ -54,6 +54,9 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
case
proto
::
VarType
::
INT16
:
case
proto
::
VarType
::
INT16
:
visitor
.
template
operator
()
<
int16_t
>();
visitor
.
template
operator
()
<
int16_t
>();
break
;
break
;
case
proto
::
VarType
::
INT8
:
visitor
.
template
operator
()
<
int8_t
>();
break
;
default:
default:
PADDLE_THROW
(
"Not supported %d"
,
type
);
PADDLE_THROW
(
"Not supported %d"
,
type
);
}
}
...
...
paddle/fluid/framework/details/multi_devices_graph_pass.cc
浏览文件 @
b98b7440
...
@@ -754,17 +754,26 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
...
@@ -754,17 +754,26 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
node
->
Op
()
->
Type
());
node
->
Op
()
->
Type
());
CreateComputationalOp
(
result
,
node
,
op_dev_id
);
CreateComputationalOp
(
result
,
node
,
op_dev_id
);
if
(
node
->
Op
()
->
Type
()
==
"concat"
)
{
}
ConnectOp
(
result
,
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
(),
"fetch_barrier"
);
void
SetOpInputsAllPlaces
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
int
num_places
)
{
auto
*
op_handle
=
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
();
for
(
ir
::
Node
*
input
:
node
->
inputs
)
{
VarHandle
*
var
=
nullptr
;
for
(
int
place_offset
=
0
;
place_offset
<
num_places
;
++
place_offset
)
{
auto
&
var_holders
=
result
->
Get
<
GraphVars
>
(
kGraphVars
)[
place_offset
];
auto
&
var_holder
=
var_holders
[
input
->
Name
()];
if
(
!
var_holder
.
empty
())
{
var
=
var_holder
.
rbegin
()
->
get
();
op_handle
->
AddInput
(
var
);
}
}
}
}
}
}
// Create RPC related op handles that connects its in ops and out ops.
// Create RPC related op handles that connects its in ops and out ops.
void
MultiDevSSAGraphBuilder
::
CreateRPCOp
(
ir
::
Graph
*
result
,
void
MultiDevSSAGraphBuilder
::
CreateRPCOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
{
ir
::
Node
*
node
)
const
{
// FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode
// put them into transpiler.
int
op_dev_id
=
-
1
;
int
op_dev_id
=
-
1
;
if
(
node
->
Op
()
->
Type
()
==
"send"
)
{
if
(
node
->
Op
()
->
Type
()
==
"send"
)
{
// TODO(paddle-dev): getting the first var is not safe.
// TODO(paddle-dev): getting the first var is not safe.
...
@@ -799,8 +808,6 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
...
@@ -799,8 +808,6 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
}
}
auto
recv_param_grad
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
auto
recv_param_grad
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
// FIXME(typhoonzero): assume each recv op output one param
// Use the same place as send.
if
(
recv_param_grad
.
size
()
==
2U
)
{
if
(
recv_param_grad
.
size
()
==
2U
)
{
op_dev_id
=
GetVarDeviceID
(
*
result
,
recv_param_grad
[
1
]);
op_dev_id
=
GetVarDeviceID
(
*
result
,
recv_param_grad
[
1
]);
VLOG
(
10
)
<<
"recv param "
<<
recv_param_grad
[
0
]
VLOG
(
10
)
<<
"recv param "
<<
recv_param_grad
[
0
]
...
@@ -814,34 +821,44 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
...
@@ -814,34 +821,44 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
.
emplace
(
varname
,
op_dev_id
);
.
emplace
(
varname
,
op_dev_id
);
}
}
}
else
{
}
else
{
// send_barrier
and fetch_barrier op can be scheduled on device 0
// send_barrier
, fetch_barrier will run on place 0;
op_dev_id
=
0
;
op_dev_id
=
0
;
}
}
PADDLE_ENFORCE
(
op_dev_id
!=
-
1
,
"can not find the right place for rpc op: %s"
,
PADDLE_ENFORCE
(
op_dev_id
!=
-
1
,
"can not find the right place for rpc op: %s"
,
node
->
Op
()
->
Type
());
node
->
Op
()
->
Type
());
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
RPCOpHandle
(
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
RPCOpHandle
(
result
->
CreateOpNode
(
node
->
Op
()),
*
node
->
Op
(),
local_scopes_
[
op_dev_id
],
result
->
CreateOpNode
(
node
->
Op
()),
*
node
->
Op
(),
local_scopes_
[
op_dev_id
],
node
->
Op
()
->
Type
(),
places_
[
op_dev_id
]));
node
->
Op
()
->
Type
(),
places_
[
op_dev_id
]));
// TODO(panyx0718): This might not be needed anymore.
if
(
node
->
Op
()
->
Type
()
==
"send"
)
{
if
(
node
->
Op
()
->
Type
()
==
"send_barrier"
)
{
CreateOpHandleIOs
(
result
,
node
,
op_dev_id
);
ConnectOp
(
result
,
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
(),
"send"
);
}
else
if
(
node
->
Op
()
->
Type
()
==
"recv"
)
{
ConnectOp
(
result
,
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
(),
"send_barrier"
);
}
else
if
(
node
->
Op
()
->
Type
()
==
"fetch_barrier"
)
{
ConnectOp
(
result
,
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
(),
"recv"
);
}
else
if
(
node
->
Op
()
->
Type
()
==
"send"
)
{
// do nothing
}
else
{
}
else
{
PADDLE_THROW
(
// send_barrier, recv, fetch_barrier's inputs are deps var, get them from
"rpc op should be in ["
// all places
"send, send_barrier. recv, fetch_barrier]"
);
auto
p
=
places_
[
op_dev_id
];
}
auto
*
op_handle
=
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
().
get
();
op_handle
->
SetDeviceContext
(
p
,
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
));
CreateOpHandleIOs
(
result
,
node
,
op_dev_id
);
SetOpInputsAllPlaces
(
result
,
node
,
places_
.
size
());
for
(
ir
::
Node
*
output
:
node
->
outputs
)
{
int
outvar_dev_id
=
op_dev_id
;
if
(
node
->
Op
()
->
Type
()
==
"fetch_barrier"
)
{
outvar_dev_id
=
GetVarDeviceID
(
*
result
,
output
->
Name
());
PADDLE_ENFORCE_NE
(
outvar_dev_id
,
-
1
);
}
p
=
places_
[
outvar_dev_id
];
ir
::
Node
*
new_node
=
nullptr
;
if
(
output
->
Var
())
{
new_node
=
result
->
CreateVarNode
(
output
->
Var
());
}
else
{
new_node
=
result
->
CreateEmptyNode
(
output
->
Name
(),
ir
::
Node
::
Type
::
kVariable
);
}
CreateOpOutput
(
result
,
op_handle
,
new_node
,
p
,
outvar_dev_id
);
}
}
}
}
bool
MultiDevSSAGraphBuilder
::
IsScaleLossOp
(
ir
::
Node
*
node
)
const
{
bool
MultiDevSSAGraphBuilder
::
IsScaleLossOp
(
ir
::
Node
*
node
)
const
{
...
...
paddle/fluid/framework/framework.proto
浏览文件 @
b98b7440
...
@@ -107,6 +107,7 @@ message VarType {
...
@@ -107,6 +107,7 @@ message VarType {
// Tensor<size_t> is used in C++.
// Tensor<size_t> is used in C++.
SIZE_T
=
19
;
SIZE_T
=
19
;
UINT8
=
20
;
UINT8
=
20
;
INT8
=
21
;
// Other types that may need additional descriptions
// Other types that may need additional descriptions
LOD_TENSOR
=
7
;
LOD_TENSOR
=
7
;
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
b98b7440
...
@@ -3,14 +3,18 @@ cc_library(graph SRCS graph.cc DEPS node)
...
@@ -3,14 +3,18 @@ cc_library(graph SRCS graph.cc DEPS node)
cc_library
(
graph_helper SRCS graph_helper.cc DEPS graph
)
cc_library
(
graph_helper SRCS graph_helper.cc DEPS graph
)
cc_library
(
pass SRCS pass.cc DEPS graph node graph_helper
)
cc_library
(
pass SRCS pass.cc DEPS graph node graph_helper
)
cc_library
(
graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper
)
cc_library
(
graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper
)
cc_library
(
graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper
)
cc_library
(
graph_traits SRCS graph_traits.cc DEPS graph
)
cc_library
(
graph_traits SRCS graph_traits.cc DEPS graph
)
cc_library
(
graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits
)
cc_library
(
graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits
)
cc_library
(
fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detecter
)
cc_library
(
fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector
)
cc_library
(
attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector
)
cc_library
(
infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass
)
cc_library
(
infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass
)
cc_library
(
fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector
)
cc_library
(
seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector
)
cc_test
(
pass_test SRCS pass_test.cc DEPS graph pass graph_helper
)
cc_test
(
pass_test SRCS pass_test.cc DEPS graph pass graph_helper
)
cc_test
(
graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry
)
cc_test
(
graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry
)
cc_test
(
graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry
)
cc_test
(
graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry
)
cc_test
(
test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter
)
cc_test
(
graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass
)
cc_test
(
test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detecter graph pass graph_traits framework_proto
)
cc_test
(
test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector
)
cc_test
(
test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto
)
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
0 → 100644
浏览文件 @
b98b7440
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/helper.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Names of the variables bound to the fused attention_lstm op.
// Each member is wired (by FindWhileOp) to the op input/output slot that
// carries the same name as the member.
struct Param {
  // --- op inputs ---
  std::string X{"concat_0.tmp_0"};
  std::string C0{"cell_init"};
  std::string H0{"hidden_init"};
  std::string AttentionWeight{"attention_fc.w_0"};
  std::string AttentionBias{"attention_fc.b_0"};
  std::string AttentionScalar{"attention_output.w_0"};
  std::string AttentionScalarBias{"attention_output.b_0"};
  std::string LSTMWeight{"attention_w.new"};
  std::string LSTMBias{"attention_b.new"};

  // --- op outputs ---
  std::string Hidden{"array_to_lod_tensor_0.tmp_0"};
  std::string Cell{"at.cell.new"};
  std::string AttentionedX{"at.x.new"};
  std::string AttentionFCOut{"at.fc.new"};
  std::string LSTMX{"at.lstmx.new"};
  std::string LSTMOUT{"at.lstmout.new"};
};
// Defined further down in this file; needed by FindWhileOp.
void PrepareParameters(Graph* graph, const Param& param);

// Replaces a hard-coded set of graph nodes with a single attention_lstm op.
//
// Steps, in order:
//   1. every node whose id is in `fused_external_ops` is collected into the
//      graph's kGraphvizMarkedNodeAttr set (created here if absent);
//   2. an "attention_lstm" OpDesc is built whose inputs/outputs are the
//      variable names held by a default-constructed Param;
//   3. the new op node is linked to the nodes with ids 34 (X), 6 (cell_init),
//      8 (hidden_init) as inputs and 81 (LSTMOUT) as output;
//   4. the marked nodes are removed through GraphSafeRemoveNodes.
//
// NOTE(review): all node ids and variable names are hard-coded, so this only
// fits the specific model graph they were taken from.
void FindWhileOp(Graph* graph) {
  GraphPatternDetector gpd;

  // Ids of the nodes that the fused op replaces.
  std::unordered_set<int> fused_external_ops(
      {35, 36, 37, 38, 43, 44, 49, 45, 46, 47, 41, 42, 53, 54,
       48, 57, 55, 56, 52, 74, 80, 77, 78, 79, 50, 39, 40, 51});

  gpd.mutable_pattern()->NewNode(
      [&](Node* n) { return fused_external_ops.count(n->id()); }, "while");

  if (!graph->Has(kGraphvizMarkedNodeAttr)) {
    graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
  }
  // The marked set lives on the graph; anything already stored in it is
  // removed below together with the nodes matched here.
  auto& marked_nodes =
      graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);

  auto handle = [&](const GraphPatternDetector::subgraph_t& subgraph,
                    Graph* g) {
    auto* while_pat_node = gpd.pattern().RetriveNode("while");
    auto* while_node = subgraph.at(while_pat_node);
    marked_nodes.insert(while_node);
  };
  gpd(graph, handle);

  Param param;
  // Add AttentionLSTM node
  OpDesc op_desc;
  op_desc.SetType("attention_lstm");

#define OP_SET_IN(x) op_desc.SetInput(#x, {param.x});
#define OP_SET_OUT(x) op_desc.SetOutput(#x, {param.x});
  OP_SET_IN(X);
  OP_SET_IN(C0);
  OP_SET_IN(H0);
  OP_SET_IN(AttentionWeight);
  OP_SET_IN(AttentionBias);
  OP_SET_IN(AttentionScalar);
  OP_SET_IN(AttentionScalarBias);
  OP_SET_IN(LSTMWeight);
  OP_SET_IN(LSTMBias);

  OP_SET_OUT(Hidden);
  OP_SET_OUT(Cell);
  OP_SET_OUT(AttentionedX);
  OP_SET_OUT(AttentionFCOut);
  OP_SET_OUT(LSTMX);
  OP_SET_OUT(LSTMOUT);
#undef OP_SET_IN
#undef OP_SET_OUT

  auto* X = graph->RetriveNode(34);
  auto* LSTMOUT = graph->RetriveNode(81);
  auto* cell_init = graph->RetriveNode(6);
  auto* hidden_init = graph->RetriveNode(8);

  // Fail loudly instead of dereferencing a null node when the graph does not
  // contain the expected ids (assumes RetriveNode yields nullptr for an
  // unknown id -- TODO confirm against Graph::RetriveNode).
  PADDLE_ENFORCE_NOT_NULL(X, "graph has no node with id 34 (X)");
  PADDLE_ENFORCE_NOT_NULL(LSTMOUT, "graph has no node with id 81 (LSTMOUT)");
  PADDLE_ENFORCE_NOT_NULL(cell_init, "graph has no node with id 6 (cell_init)");
  PADDLE_ENFORCE_NOT_NULL(hidden_init,
                          "graph has no node with id 8 (hidden_init)");

#define LINK_TO(node0, node1)      \
  node0->outputs.push_back(node1); \
  node1->inputs.push_back(node0);

  auto* lstm_op = graph->CreateOpNode(&op_desc);
  PrepareParameters(graph, param);

  LINK_TO(X, lstm_op);
  LINK_TO(cell_init, lstm_op);
  LINK_TO(hidden_init, lstm_op);
  LINK_TO(lstm_op, LSTMOUT);
#undef LINK_TO  // keep the helper macro local to this function

  GraphSafeRemoveNodes(graph, marked_nodes);
}
// Null-pointer assertion helpers: CHECK_Pn(x0, ...) applies
// PADDLE_ENFORCE_NOT_NULL to each of its n arguments.
// Each macro expands to several statements (it is not wrapped in
// do { } while (0)), so use it only as a standalone statement and never as
// the body of an unbraced if/else/for.
#define CHECK_P1(x) PADDLE_ENFORCE_NOT_NULL(x);
#define CHECK_P2(x0, x1) \
CHECK_P1(x0); \
CHECK_P1(x1);
#define CHECK_P3(x0, x1, x2) \
CHECK_P2(x0, x1); \
CHECK_P1(x2);
#define CHECK_P4(x0, x1, x2, x3) \
CHECK_P3(x0, x1, x2); \
CHECK_P1(x3);
#define CHECK_P5(x0, x1, x2, x3, x4) \
CHECK_P4(x0, x1, x2, x3); \
CHECK_P1(x4);
// Forward declarations of the packing helpers defined below.
void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
                       const LoDTensor& W_forget_w1,
                       const LoDTensor& W_input_w0, const LoDTensor& W_input_w1,
                       const LoDTensor& W_output_w0,
                       const LoDTensor& W_output_w1,
                       const LoDTensor& W_cell_w0, const LoDTensor& W_cell_w1,
                       LoDTensor* out);

void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
                     const LoDTensor& B_output, const LoDTensor& B_cell,
                     LoDTensor* out);

// Creates and fills, in the graph's parameter scope, the variables the fused
// attention_lstm op needs.
//
// - Requires the graph to carry a Scope* under kParamScopeAttr.
// - Creates (or reuses) LoDTensor variables for the fused weight/bias and for
//   every op output named in `param`.
// - Reads the per-gate parameters "<gate>.w_0", "<gate>.w_1", "<gate>.b_0"
//   for the gates forget/input/output/c and packs them into
//   param.LSTMWeight / param.LSTMBias.
// - Reshapes the attention bias and attention scalar bias in place from
//   rank 1 {n} to {1, n}.
//
// Throws (via the PADDLE_ENFORCE family) when the scope attribute or any
// required variable is missing, or when a bias is not rank 1.
void PrepareParameters(Graph* graph, const Param& param) {
  // Check parameters
  PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
  auto* scope = graph->Get<Scope*>(kParamScopeAttr);

  // Create new parameters. The fused weight/bias tensors are kept so they
  // can be filled at the end without looking the variables up a second time.
  auto* lstm_weight_t = scope->Var(param.LSTMWeight)->GetMutable<LoDTensor>();
  auto* lstm_bias_t = scope->Var(param.LSTMBias)->GetMutable<LoDTensor>();
  scope->Var(param.Hidden)->GetMutable<LoDTensor>();
  scope->Var(param.Cell)->GetMutable<LoDTensor>();
  scope->Var(param.AttentionedX)->GetMutable<LoDTensor>();
  scope->Var(param.AttentionFCOut)->GetMutable<LoDTensor>();
  scope->Var(param.LSTMX)->GetMutable<LoDTensor>();
  scope->Var(param.LSTMOUT)->GetMutable<LoDTensor>();

// For gate <name__>: look up its three parameter variables, assert they
// exist, log their shapes and bind const references W_<name__>_{w0,w1,b0}_t.
#define GATE_W(name__)                                               \
  auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0");            \
  auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1");            \
  auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0");            \
  CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0);       \
  VLOG(4) << #name__ "_w0"                                           \
          << " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
  VLOG(4) << #name__ "_w1"                                           \
          << " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
  VLOG(4) << #name__ "_b0"                                           \
          << " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
  auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>();       \
  auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>();       \
  auto& W_##name__##_b0_t = W_##name__##_b0->Get<LoDTensor>();

  GATE_W(forget);
  GATE_W(input);
  GATE_W(output);
  GATE_W(c);
#undef GATE_W

  // Look the attention parameters up under the names the fused op will
  // actually use, so the pointers that are null-checked are the very ones
  // dereferenced below.
  auto* attention_fc_w = scope->FindVar(param.AttentionWeight);
  auto* attention_fc_b = scope->FindVar(param.AttentionBias);
  auto* attention_output_w = scope->FindVar(param.AttentionScalar);
  auto* attention_output_b = scope->FindVar(param.AttentionScalarBias);
  CHECK_P4(attention_fc_w, attention_fc_b, attention_output_w,
           attention_output_b);

  // reshape attention_bias: {n} -> {1, n}
  auto* attention_bias_t = attention_fc_b->GetMutable<LoDTensor>();
  PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1);
  attention_bias_t->Resize(make_ddim({1, attention_bias_t->dims()[0]}));

  // Same reshape for the scalar bias, with the same rank guard.
  auto* attention_scalar_bias_t = attention_output_b->GetMutable<LoDTensor>();
  PADDLE_ENFORCE_EQ(attention_scalar_bias_t->dims().size(), 1);
  attention_scalar_bias_t->Resize(
      make_ddim({1, attention_scalar_bias_t->dims()[0]}));

  PrepareLSTMWeight(W_forget_w0_t, W_forget_w1_t, W_input_w0_t, W_input_w1_t,
                    W_output_w0_t, W_output_w1_t, W_c_w0_t, W_c_w1_t,
                    lstm_weight_t);
  PrepareLSTMBias(W_forget_b0_t, W_input_b0_t, W_output_b0_t, W_c_b0_t,
                  lstm_bias_t);
}
// Prepare parameters
void
PrepareLSTMWeight
(
const
LoDTensor
&
W_forget_w0
,
const
LoDTensor
&
W_forget_w1
,
const
LoDTensor
&
W_input_w0
,
const
LoDTensor
&
W_input_w1
,
const
LoDTensor
&
W_output_w0
,
const
LoDTensor
&
W_output_w1
,
const
LoDTensor
&
W_cell_w0
,
const
LoDTensor
&
W_cell_w1
,
LoDTensor
*
out
)
{
int
D
=
W_forget_w0
.
dims
()[
0
];
int
M
=
W_forget_w1
.
dims
()[
0
];
out
->
Resize
(
make_ddim
({
D
+
M
,
4
*
D
}));
VLOG
(
3
)
<<
"LSTMWeight resized to "
<<
out
->
dims
();
float
*
out_data
=
out
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
std
::
array
<
const
float
*
,
4
>
tensors
(
{
W_forget_w0
.
data
<
float
>
(),
W_input_w0
.
data
<
float
>
(),
W_output_w0
.
data
<
float
>
(),
W_cell_w0
.
data
<
float
>
()});
std
::
array
<
const
float
*
,
4
>
tensors1
(
{
W_forget_w1
.
data
<
float
>
(),
W_input_w1
.
data
<
float
>
(),
W_output_w1
.
data
<
float
>
(),
W_cell_w1
.
data
<
float
>
()});
for
(
int
row
=
0
;
row
<
D
;
row
++
)
{
for
(
int
col
=
0
;
col
<
4
;
col
++
)
{
float
*
dst
=
out_data
+
4
*
D
*
row
+
D
*
col
;
const
float
*
src
=
tensors
[
col
]
+
D
*
row
;
memcpy
(
dst
,
src
,
D
*
sizeof
(
float
));
}
}
for
(
int
row
=
0
;
row
<
M
;
row
++
)
{
for
(
int
col
=
0
;
col
<
4
;
col
++
)
{
float
*
dst
=
out_data
+
4
*
D
*
(
D
+
row
)
+
D
*
col
;
const
float
*
src
=
tensors1
[
col
]
+
D
*
row
;
memcpy
(
dst
,
src
,
D
*
sizeof
(
float
));
}
}
}
void
PrepareLSTMBias
(
const
LoDTensor
&
B_forget
,
const
LoDTensor
&
B_input
,
const
LoDTensor
&
B_output
,
const
LoDTensor
&
B_cell
,
LoDTensor
*
out
)
{
std
::
array
<
const
float
*
,
4
>
tensors
(
{
B_forget
.
data
<
float
>
(),
B_input
.
data
<
float
>
(),
B_output
.
data
<
float
>
(),
B_cell
.
data
<
float
>
()});
PADDLE_ENFORCE_EQ
(
B_forget
.
dims
().
size
(),
1
);
int
D
=
B_forget
.
dims
()[
0
];
out
->
Resize
(
make_ddim
({
1
,
4
*
D
}));
auto
*
out_data
=
out
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
for
(
size_t
i
=
0
;
i
<
tensors
.
size
();
i
++
)
{
memcpy
(
out_data
+
D
*
i
,
tensors
[
i
],
D
*
sizeof
(
float
));
}
}
// Parameters
// Pass entry point: runs FindWhileOp on the graph in place and hands the
// same graph back to the caller.
std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  // The two PDPattern locals that used to be declared here were never used.
  FindWhileOp(graph.get());
  return graph;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
attention_lstm_fuse_pass
,
paddle
::
framework
::
ir
::
AttentionLSTMFusePass
);
paddle/fluid/
inference/analysis/dot.cc
→
paddle/fluid/
framework/ir/attention_lstm_fuse_pass.h
浏览文件 @
b98b7440
...
@@ -12,12 +12,19 @@
...
@@ -12,12 +12,19 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/inference/analysis/dot.h"
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
framework
{
namespace
analysis
{
namespace
ir
{
size_t
Dot
::
counter
=
0
;
}
// namespace analysis
class
AttentionLSTMFusePass
:
public
FusePassBase
{
}
// namespace inference
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
;
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/ir/fc_fuse_pass.cc
浏览文件 @
b98b7440
...
@@ -100,12 +100,10 @@ void BuildFCPattern(PDPattern* pattern) {
...
@@ -100,12 +100,10 @@ void BuildFCPattern(PDPattern* pattern) {
},
},
"elementwise_add_out"
);
"elementwise_add_out"
);
pattern
->
AddEdge
(
mul_parameter_var
,
mul_op
);
mul_op
->
LinksFrom
({
mul_parameter_var
,
mul_tmp_input_var
})
pattern
->
AddEdge
(
mul_tmp_input_var
,
mul_op
);
.
LinksTo
({
mul_out_var
});
pattern
->
AddEdge
(
mul_op
,
mul_out_var
);
elementwise_add_op
->
LinksFrom
({
mul_out_var
,
elementwise_add_tmp_var
})
pattern
->
AddEdge
(
mul_out_var
,
elementwise_add_op
);
.
LinksTo
({
elementwise_add_out_var
});
pattern
->
AddEdge
(
elementwise_add_tmp_var
,
elementwise_add_op
);
pattern
->
AddEdge
(
elementwise_add_op
,
elementwise_add_out_var
);
}
}
// Replace the node `from` in the links to `to`
// Replace the node `from` in the links to `to`
...
@@ -125,7 +123,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
...
@@ -125,7 +123,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
std
::
unordered_set
<
Node
*>
nodes2delete
;
std
::
unordered_set
<
Node
*>
nodes2delete
;
GraphPatternDetect
e
r
gpd
;
GraphPatternDetect
o
r
gpd
;
BuildFCPattern
(
gpd
.
mutable_pattern
());
BuildFCPattern
(
gpd
.
mutable_pattern
());
#define GET_NODE(id) \
#define GET_NODE(id) \
...
@@ -134,7 +132,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
...
@@ -134,7 +132,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
auto* id = subgraph.at(gpd.pattern().RetriveNode(#id)); \
auto* id = subgraph.at(gpd.pattern().RetriveNode(#id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
auto
handler
=
[
&
](
const
GraphPatternDetect
e
r
::
subgraph_t
&
subgraph
,
auto
handler
=
[
&
](
const
GraphPatternDetect
o
r
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle FC fuse"
;
VLOG
(
4
)
<<
"handle FC fuse"
;
// Currently, there is no FC op available, so I will just simulate the
// Currently, there is no FC op available, so I will just simulate the
...
...
paddle/fluid/framework/ir/fc_fuse_pass.h
浏览文件 @
b98b7440
...
@@ -13,7 +13,7 @@
...
@@ -13,7 +13,7 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detect
e
r.h"
#include "paddle/fluid/framework/ir/graph_pattern_detect
o
r.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace
paddle
{
namespace
paddle
{
...
...
paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
0 → 100644
浏览文件 @
b98b7440
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Replaces two hard-coded groups of nodes with "fusion_lstm" op nodes.
//
// Steps, in order:
//   1. every node whose id is in `fused_ops` is collected into marked_nodes;
//   2. lstm_creator builds one fusion_lstm op per LSTM from nodes looked up
//      by id, linking input/WeightX/WeightH/Bias -> op -> hidden;
//   3. the marked nodes are removed from the graph and every remaining node
//      drops its edges to them.
//
// NOTE(review): node ids and the BatchedGate/BatchCellPreAct variable names
// are hard-coded, so this only fits the model graph they were taken from;
// both created ops also share the same two output variable names.
std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  GraphPatternDetector gpd;
  auto* pattern = gpd.mutable_pattern();

  std::unordered_set<int> fused_ops({// first lstm
                                     13, 15, 16,
                                     // second lstm
                                     23, 25, 26});

  pattern->NewNode([&](Node* x) { return fused_ops.count(x->id()); },
                   "any_node");

  std::unordered_set<Node*> marked_nodes;

  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    auto* id = subgraph.at(gpd.pattern().RetriveNode("any_node"));
    marked_nodes.insert(id);
  };
  gpd(graph.get(), handler);

  // Create New OpDesc
  // All arguments are node ids. `lstm` is the original lstm op node, whose
  // "is_reverse" attribute is copied onto the fused op.
  auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h,
                          int bias, int hidden, int cell, int xx) {
// Look the node up by id and fail loudly when it is missing instead of
// dereferencing a null pointer below (assumes RetriveNode yields nullptr for
// an unknown id -- TODO confirm against Graph::RetriveNode).
#define GET_NODE(x)                        \
  auto* x##_n = graph->RetriveNode(x);     \
  PADDLE_ENFORCE_NOT_NULL(                 \
      x##_n, "graph has no node with id %d (" #x ")", x);
    GET_NODE(input);
    GET_NODE(weight_x);
    GET_NODE(weight_h);
    GET_NODE(bias);
    GET_NODE(hidden);
    GET_NODE(cell);
    GET_NODE(xx);
    GET_NODE(lstm);

    OpDesc op_desc;
    op_desc.SetType("fusion_lstm");
#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()});
    SET_IN(X, input);
    SET_IN(WeightX, weight_x);
    SET_IN(WeightH, weight_h);
    SET_IN(Bias, bias);
#undef GET_NODE
#undef SET_IN

    LOG(INFO) << "hidden_n: " << hidden_n->Name();
    LOG(INFO) << "cell: " << cell_n->Name();
    LOG(INFO) << "xx: " << xx_n->Name();

    op_desc.SetInput("H0", {});
    op_desc.SetInput("C0", {});
    op_desc.SetOutput("Hidden", {hidden_n->Name()});
    op_desc.SetOutput("Cell", {cell_n->Name()});
    op_desc.SetOutput("XX", {xx_n->Name()});
    op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"});
    op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"});
    op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
    op_desc.SetAttr("use_peepholes", false);

    auto* op = graph->CreateOpNode(&op_desc);

#define LINK_TO(a, b)      \
  a->outputs.push_back(b); \
  b->inputs.push_back(a);
    LINK_TO(input_n, op);
    LINK_TO(weight_x_n, op);
    LINK_TO(weight_h_n, op);
    LINK_TO(bias_n, op);
    // Only the Hidden output gets a graph edge; Cell and XX are named on the
    // op but not linked.
    LINK_TO(op, hidden_n);
#undef LINK_TO
    return op;
  };

  lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
  lstm_creator(26, 12, 24, 28, 27, 32, 31, 29);

  // remove all the nodes
  for (auto* node : marked_nodes) {
    // marked_nodes already stores Node*, so no cast is needed here.
    graph->RemoveNode(node);
  }

  // Drop every edge that still points at a removed node. Only pointer values
  // are compared; the removed nodes themselves are never dereferenced.
  for (auto* node : graph->Nodes()) {
    for (auto it = node->inputs.begin(); it != node->inputs.end();) {
      if (marked_nodes.count(*it)) {
        it = const_cast<Node*>(node)->inputs.erase(it);
      } else {
        it++;
      }
    }
    for (auto it = node->outputs.begin(); it != node->outputs.end();) {
      if (marked_nodes.count(*it)) {
        it = const_cast<Node*>(node)->outputs.erase(it);
      } else {
        it++;
      }
    }
  }

  return graph;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
fc_lstm_fuse_pass
,
paddle
::
framework
::
ir
::
FCLstmFusePass
);
paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
0 → 100644
浏览文件 @
b98b7440
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Graph pass whose ApplyImpl (see fc_lstm_fuse_pass.cc) creates
// "fusion_lstm" op nodes and removes the nodes they replace.
class FCLstmFusePass : public Pass {
 public:
  virtual ~FCLstmFusePass() = default;

 protected:
  // Takes ownership of `graph`, rewrites it and returns it.
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/fuse_pass_base.h
0 → 100644
浏览文件 @
b98b7440
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Name of the graph attribute that holds the parameter Scope* read by fuse
// passes.
static const char kParamScopeAttr[] = "param_scope";

// Base class for fuse passes that need the parameter scope attached to the
// graph they operate on.
class FusePassBase : public Pass {
 public:
  // Remembers the graph being processed. The method is const (and graph_ is
  // mutable) so it can be called from the const ApplyImpl of a derived pass.
  void Init(Graph* graph) const { graph_ = graph; }

  // Returns the Scope* stored on the graph under kParamScopeAttr.
  // Throws if Init() has not been called with a non-null graph, or if the
  // graph does not carry the attribute.
  Scope* param_scope() const {
    PADDLE_ENFORCE_NOT_NULL(
        graph_, "FusePassBase::Init should be called before param_scope()");
    PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
    return graph_->Get<framework::Scope*>(kParamScopeAttr);
  }

  virtual ~FusePassBase() {}

 protected:
  // Null until Init() is called, so a premature param_scope() fails cleanly
  // instead of reading an indeterminate pointer.
  mutable Graph* graph_{nullptr};
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph.cc
浏览文件 @
b98b7440
...
@@ -132,63 +132,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
...
@@ -132,63 +132,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
}
}
}
}
std
::
vector
<
ir
::
Node
*>
send_ops
;
ir
::
Node
*
send_bar
=
nullptr
;
std
::
vector
<
ir
::
Node
*>
recv_ops
;
ir
::
Node
*
fetch_bar
=
nullptr
;
for
(
ir
::
Node
*
node
:
Nodes
())
{
if
(
node
->
Name
()
==
"send"
)
{
send_ops
.
push_back
(
node
);
}
else
if
(
node
->
Name
()
==
"send_barrier"
)
{
PADDLE_ENFORCE
(
!
send_bar
,
"only has one send barrier"
);
send_bar
=
node
;
}
else
if
(
node
->
Name
()
==
"recv"
)
{
recv_ops
.
push_back
(
node
);
}
else
if
(
node
->
Name
()
==
"fetch_barrier"
)
{
PADDLE_ENFORCE
(
!
fetch_bar
,
"only has one fetch barrier"
);
fetch_bar
=
node
;
}
}
if
(
send_bar
)
{
for
(
ir
::
Node
*
send
:
send_ops
)
{
ir
::
Node
*
dep_var
=
CreateControlDepVar
();
send
->
outputs
.
push_back
(
dep_var
);
dep_var
->
inputs
.
push_back
(
send
);
send_bar
->
inputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
send_bar
);
}
for
(
ir
::
Node
*
recv
:
recv_ops
)
{
ir
::
Node
*
dep_var
=
CreateControlDepVar
();
recv
->
inputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
recv
);
send_bar
->
outputs
.
push_back
(
dep_var
);
dep_var
->
inputs
.
push_back
(
send_bar
);
}
}
if
(
fetch_bar
)
{
for
(
ir
::
Node
*
recv
:
recv_ops
)
{
ir
::
Node
*
dep_var
=
CreateControlDepVar
();
recv
->
outputs
.
push_back
(
dep_var
);
dep_var
->
inputs
.
push_back
(
recv
);
fetch_bar
->
inputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
fetch_bar
);
}
}
std
::
vector
<
std
::
string
>
send_vars
=
FindDistTrainSendVars
(
send_ops
);
std
::
vector
<
std
::
string
>
recv_vars
=
FindDistTrainRecvVars
(
recv_ops
);
for
(
ir
::
Node
*
node
:
Nodes
())
{
if
(
IsDistTrainOp
(
node
,
send_vars
,
recv_vars
))
{
if
(
fetch_bar
&&
node
->
Name
()
==
"concat"
)
{
ir
::
Node
*
dep_var
=
CreateControlDepVar
();
fetch_bar
->
outputs
.
push_back
(
dep_var
);
dep_var
->
inputs
.
push_back
(
fetch_bar
);
node
->
inputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
node
);
}
}
}
/**
/**
* We should handle write after read(WAR) and write after write(WAW) here.
* We should handle write after read(WAR) and write after write(WAW) here.
* Because some of the operators of the program can be executed parallelly.
* Because some of the operators of the program can be executed parallelly.
...
...
paddle/fluid/framework/ir/graph.h
浏览文件 @
b98b7440
...
@@ -99,13 +99,13 @@ class Graph {
...
@@ -99,13 +99,13 @@ class Graph {
// Create a normal variable with non-null VarDesc.
// Create a normal variable with non-null VarDesc.
ir
::
Node
*
CreateVarNode
(
VarDesc
*
var_desc
)
{
ir
::
Node
*
CreateVarNode
(
VarDesc
*
var_desc
)
{
PADDLE_ENFORCE
(
var_desc
);
PADDLE_ENFORCE
(
var_desc
);
return
AddNode
(
new
ir
::
Node
(
var_desc
));
return
AddNode
(
new
ir
::
Node
(
var_desc
,
node_count_
++
));
}
}
// Create a normal runnable operator with OpDesc.
// Create a normal runnable operator with OpDesc.
ir
::
Node
*
CreateOpNode
(
OpDesc
*
op_desc
)
{
ir
::
Node
*
CreateOpNode
(
OpDesc
*
op_desc
)
{
PADDLE_ENFORCE
(
op_desc
);
PADDLE_ENFORCE
(
op_desc
);
return
AddNode
(
new
ir
::
Node
(
op_desc
));
return
AddNode
(
new
ir
::
Node
(
op_desc
,
node_count_
++
));
}
}
// Create a control dependency var that connects 2 operations. The
// Create a control dependency var that connects 2 operations. The
...
@@ -115,13 +115,14 @@ class Graph {
...
@@ -115,13 +115,14 @@ class Graph {
// TODO(panyx0718): control var name should be really unique.
// TODO(panyx0718): control var name should be really unique.
const
std
::
string
name
=
string
::
Sprintf
(
const
std
::
string
name
=
string
::
Sprintf
(
"%s@%llu"
,
ir
::
Node
::
kControlDepVarName
,
node_set_
.
size
());
"%s@%llu"
,
ir
::
Node
::
kControlDepVarName
,
node_set_
.
size
());
return
AddNode
(
new
ir
::
Node
(
name
,
ir
::
Node
::
Type
::
kVariable
));
return
AddNode
(
new
ir
::
Node
(
name
,
ir
::
Node
::
Type
::
kVariable
,
node_count_
++
));
}
}
// A more free style way of creating a graph node. Mostly use for test
// A more free style way of creating a graph node. Mostly use for test
// or "copy" from another node. Avoid using it if possible.
// or "copy" from another node. Avoid using it if possible.
ir
::
Node
*
CreateEmptyNode
(
const
std
::
string
&
name
,
ir
::
Node
::
Type
type
)
{
ir
::
Node
*
CreateEmptyNode
(
const
std
::
string
&
name
,
ir
::
Node
::
Type
type
)
{
return
AddNode
(
new
ir
::
Node
(
name
,
type
));
return
AddNode
(
new
ir
::
Node
(
name
,
type
,
node_count_
++
));
}
}
// Clear all node information of the graph and return the ownership of the
// Clear all node information of the graph and return the ownership of the
...
@@ -142,12 +143,20 @@ class Graph {
...
@@ -142,12 +143,20 @@ class Graph {
nodes_
.
erase
(
node
);
nodes_
.
erase
(
node
);
}
}
// Looks up a node by its integer id in id2node_. Returns nullptr when no node
// is registered under `id`.
Node *RetriveNode(int id) {
  const auto found = id2node_.find(id);
  return found == id2node_.end() ? nullptr : found->second;
}
private:
private:
// This method takes ownership of `node`.
// This method takes ownership of `node`.
ir
::
Node
*
AddNode
(
ir
::
Node
*
node
)
{
ir
::
Node
*
AddNode
(
ir
::
Node
*
node
)
{
PADDLE_ENFORCE
(
node_set_
.
find
(
node
)
==
node_set_
.
end
());
PADDLE_ENFORCE
(
node_set_
.
find
(
node
)
==
node_set_
.
end
());
nodes_
[
node
].
reset
(
node
);
nodes_
[
node
].
reset
(
node
);
node_set_
.
insert
(
node
);
node_set_
.
insert
(
node
);
PADDLE_ENFORCE
(
!
id2node_
.
count
(
node
->
id
()),
"duplicate id %d"
,
node
->
id
());
id2node_
[
node
->
id
()]
=
node
;
return
node
;
return
node
;
}
}
...
@@ -157,6 +166,8 @@ class Graph {
...
@@ -157,6 +166,8 @@ class Graph {
std
::
map
<
std
::
string
,
std
::
function
<
void
(
void
)
>>
attr_dels_
;
std
::
map
<
std
::
string
,
std
::
function
<
void
(
void
)
>>
attr_dels_
;
std
::
map
<
ir
::
Node
*
,
std
::
unique_ptr
<
ir
::
Node
>>
nodes_
;
std
::
map
<
ir
::
Node
*
,
std
::
unique_ptr
<
ir
::
Node
>>
nodes_
;
std
::
unordered_set
<
ir
::
Node
*>
node_set_
;
std
::
unordered_set
<
ir
::
Node
*>
node_set_
;
std
::
map
<
int
,
Node
*>
id2node_
;
int
node_count_
{
0
};
};
};
bool
IsControlDepVar
(
const
ir
::
Node
&
var
);
bool
IsControlDepVar
(
const
ir
::
Node
&
var
);
...
...
paddle/fluid/framework/ir/graph_helper.cc
浏览文件 @
b98b7440
...
@@ -103,10 +103,10 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
...
@@ -103,10 +103,10 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
for
(
auto
&
var
:
n
->
inputs
)
{
for
(
auto
&
var
:
n
->
inputs
)
{
for
(
auto
&
adj_n
:
var
->
inputs
)
{
for
(
auto
&
adj_n
:
var
->
inputs
)
{
PADDLE_ENFORCE
(
adj_n
->
NodeType
()
==
ir
::
Node
::
Type
::
kOperation
);
PADDLE_ENFORCE
(
adj_n
->
NodeType
()
==
ir
::
Node
::
Type
::
kOperation
);
adj_list
[
n
].
insert
(
adj_n
);
VLOG
(
4
)
<<
"adj "
<<
adj_n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
adj_n
)
VLOG
(
4
)
<<
"adj "
<<
adj_n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
adj_n
)
<<
" -> "
<<
n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
n
)
<<
" -> "
<<
n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
n
)
<<
" via "
<<
var
->
Name
()
<<
reinterpret_cast
<
void
*>
(
var
);
<<
" via "
<<
var
->
Name
()
<<
reinterpret_cast
<
void
*>
(
var
);
adj_list
[
n
].
insert
(
adj_n
);
}
}
}
}
}
}
...
...
paddle/fluid/framework/ir/graph_pattern_detect
e
r.cc
→
paddle/fluid/framework/ir/graph_pattern_detect
o
r.cc
浏览文件 @
b98b7440
...
@@ -17,7 +17,7 @@
...
@@ -17,7 +17,7 @@
#include <vector>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detect
e
r.h"
#include "paddle/fluid/framework/ir/graph_pattern_detect
o
r.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
...
@@ -34,7 +34,7 @@ PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
...
@@ -34,7 +34,7 @@ PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
name
);
name
);
}
}
nodes_
.
emplace_back
(
new
PDNode
(
std
::
move
(
teller
),
name
));
nodes_
.
emplace_back
(
new
PDNode
(
std
::
move
(
teller
),
this
,
name
));
auto
*
cur
=
nodes_
.
back
().
get
();
auto
*
cur
=
nodes_
.
back
().
get
();
node_map_
[
name
]
=
cur
;
node_map_
[
name
]
=
cur
;
return
cur
;
return
cur
;
...
@@ -56,19 +56,22 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
...
@@ -56,19 +56,22 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
edges_
.
emplace_back
(
a
,
b
);
edges_
.
emplace_back
(
a
,
b
);
}
}
void
GraphPatternDetect
e
r
::
operator
()(
Graph
*
graph
,
void
GraphPatternDetect
o
r
::
operator
()(
Graph
*
graph
,
GraphPatternDetect
e
r
::
handle_t
handler
)
{
GraphPatternDetect
o
r
::
handle_t
handler
)
{
if
(
!
MarkPDNodesInGraph
(
*
graph
))
return
;
if
(
!
MarkPDNodesInGraph
(
*
graph
))
return
;
auto
subgraphs
=
DetectPatterns
();
auto
subgraphs
=
DetectPatterns
();
UniquePatterns
(
&
subgraphs
);
UniquePatterns
(
&
subgraphs
);
RemoveOverlappedMatch
(
&
subgraphs
);
RemoveOverlappedMatch
(
&
subgraphs
);
LOG
(
INFO
)
<<
"detect "
<<
subgraphs
.
size
()
<<
" subgraph matches the pattern"
;
int
id
=
0
;
for
(
auto
&
g
:
subgraphs
)
{
for
(
auto
&
g
:
subgraphs
)
{
LOG
(
INFO
)
<<
"optimizing #"
<<
id
++
<<
" subgraph"
;
handler
(
g
,
graph
);
handler
(
g
,
graph
);
}
}
}
}
bool
GraphPatternDetect
e
r
::
MarkPDNodesInGraph
(
const
ir
::
Graph
&
graph
)
{
bool
GraphPatternDetect
o
r
::
MarkPDNodesInGraph
(
const
ir
::
Graph
&
graph
)
{
VLOG
(
4
)
<<
"mark pdnodes in graph"
;
VLOG
(
4
)
<<
"mark pdnodes in graph"
;
if
(
graph
.
Nodes
().
empty
())
return
false
;
if
(
graph
.
Nodes
().
empty
())
return
false
;
...
@@ -114,13 +117,15 @@ bool IsNodesLink(Node* a, Node* b) {
...
@@ -114,13 +117,15 @@ bool IsNodesLink(Node* a, Node* b) {
return
false
;
return
false
;
}
}
std
::
vector
<
GraphPatternDetect
e
r
::
subgraph_t
>
std
::
vector
<
GraphPatternDetect
o
r
::
subgraph_t
>
GraphPatternDetect
e
r
::
DetectPatterns
()
{
GraphPatternDetect
o
r
::
DetectPatterns
()
{
// Init empty subgraphs.
// Init empty subgraphs.
std
::
vector
<
GraphPatternDetect
e
r
::
subgraph_t
>
result
;
std
::
vector
<
GraphPatternDetect
o
r
::
subgraph_t
>
result
;
std
::
vector
<
HitGroup
>
init_groups
;
std
::
vector
<
HitGroup
>
init_groups
;
PADDLE_ENFORCE
(
!
pattern_
.
edges
().
empty
(),
"At least one edge is needed"
);
std
::
array
<
std
::
vector
<
HitGroup
>
,
2
>
bi_records
;
auto
*
first_pnode
=
pattern_
.
edges
().
front
().
first
;
// PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
auto
*
first_pnode
=
pattern_
.
edges
().
empty
()
?
pattern
().
nodes
().
front
().
get
()
:
pattern_
.
edges
().
front
().
first
;
if
(
!
pdnodes2nodes_
.
count
(
first_pnode
))
return
result
;
if
(
!
pdnodes2nodes_
.
count
(
first_pnode
))
return
result
;
for
(
auto
*
node
:
pdnodes2nodes_
[
first_pnode
])
{
for
(
auto
*
node
:
pdnodes2nodes_
[
first_pnode
])
{
HitGroup
group
;
HitGroup
group
;
...
@@ -129,7 +134,6 @@ GraphPatternDetecter::DetectPatterns() {
...
@@ -129,7 +134,6 @@ GraphPatternDetecter::DetectPatterns() {
}
}
int
step
=
0
;
int
step
=
0
;
std
::
array
<
std
::
vector
<
HitGroup
>
,
2
>
bi_records
;
bi_records
[
0
]
=
std
::
move
(
init_groups
);
bi_records
[
0
]
=
std
::
move
(
init_groups
);
// Extend a PDNode to subgraphs by deducing the connection relations defined
// Extend a PDNode to subgraphs by deducing the connection relations defined
...
@@ -141,6 +145,7 @@ GraphPatternDetecter::DetectPatterns() {
...
@@ -141,6 +145,7 @@ GraphPatternDetecter::DetectPatterns() {
auto
&
pre_groups
=
bi_records
[
step
%
2
];
auto
&
pre_groups
=
bi_records
[
step
%
2
];
auto
&
cur_groups
=
bi_records
[
1
-
(
step
++
%
2
)];
auto
&
cur_groups
=
bi_records
[
1
-
(
step
++
%
2
)];
cur_groups
.
clear
();
cur_groups
.
clear
();
if
(
pre_groups
.
empty
())
break
;
// source -> target
// source -> target
for
(
Node
*
source
:
pdnodes2nodes_
[
edge
.
first
])
{
for
(
Node
*
source
:
pdnodes2nodes_
[
edge
.
first
])
{
for
(
Node
*
target
:
pdnodes2nodes_
[
edge
.
second
])
{
for
(
Node
*
target
:
pdnodes2nodes_
[
edge
.
second
])
{
...
@@ -163,7 +168,7 @@ GraphPatternDetecter::DetectPatterns() {
...
@@ -163,7 +168,7 @@ GraphPatternDetecter::DetectPatterns() {
}
}
for
(
auto
&
group
:
bi_records
[
step
%
2
])
{
for
(
auto
&
group
:
bi_records
[
step
%
2
])
{
GraphPatternDetect
e
r
::
subgraph_t
subgraph
;
GraphPatternDetect
o
r
::
subgraph_t
subgraph
;
for
(
auto
&
role
:
group
.
roles
)
{
for
(
auto
&
role
:
group
.
roles
)
{
subgraph
.
emplace
(
role
.
first
,
role
.
second
);
subgraph
.
emplace
(
role
.
first
,
role
.
second
);
}
}
...
@@ -172,10 +177,10 @@ GraphPatternDetecter::DetectPatterns() {
...
@@ -172,10 +177,10 @@ GraphPatternDetecter::DetectPatterns() {
return
result
;
return
result
;
}
}
void
GraphPatternDetect
e
r
::
UniquePatterns
(
void
GraphPatternDetect
o
r
::
UniquePatterns
(
std
::
vector
<
GraphPatternDetect
e
r
::
subgraph_t
>*
subgraphs
)
{
std
::
vector
<
GraphPatternDetect
o
r
::
subgraph_t
>*
subgraphs
)
{
if
(
subgraphs
->
empty
())
return
;
if
(
subgraphs
->
empty
())
return
;
std
::
vector
<
GraphPatternDetect
e
r
::
subgraph_t
>
result
;
std
::
vector
<
GraphPatternDetect
o
r
::
subgraph_t
>
result
;
std
::
unordered_set
<
size_t
>
set
;
std
::
unordered_set
<
size_t
>
set
;
for
(
auto
&
g
:
*
subgraphs
)
{
for
(
auto
&
g
:
*
subgraphs
)
{
...
@@ -192,7 +197,7 @@ void GraphPatternDetecter::UniquePatterns(
...
@@ -192,7 +197,7 @@ void GraphPatternDetecter::UniquePatterns(
*
subgraphs
=
result
;
*
subgraphs
=
result
;
}
}
void
GraphPatternDetect
e
r
::
RemoveOverlappedMatch
(
void
GraphPatternDetect
o
r
::
RemoveOverlappedMatch
(
std
::
vector
<
subgraph_t
>*
subgraphs
)
{
std
::
vector
<
subgraph_t
>*
subgraphs
)
{
std
::
vector
<
subgraph_t
>
result
;
std
::
vector
<
subgraph_t
>
result
;
std
::
unordered_set
<
Node
*>
node_set
;
std
::
unordered_set
<
Node
*>
node_set
;
...
@@ -215,6 +220,46 @@ void GraphPatternDetecter::RemoveOverlappedMatch(
...
@@ -215,6 +220,46 @@ void GraphPatternDetecter::RemoveOverlappedMatch(
*
subgraphs
=
result
;
*
subgraphs
=
result
;
}
}
std
::
string
PDPattern
::
DotString
()
const
{
using
inference
::
analysis
::
Dot
;
Dot
dot
;
int
id
=
0
;
// Create Nodes
std
::
unordered_map
<
PDNode
*
,
std
::
string
>
node2dot
;
for
(
const
auto
&
node
:
nodes
())
{
std
::
string
node_id
=
"Node"
+
std
::
to_string
(
id
++
);
dot
.
AddNode
(
node_id
,
{},
node
->
name
());
node2dot
[
node
.
get
()]
=
node_id
;
}
// Create Edges
for
(
const
auto
&
edge
:
edges
())
{
if
(
!
node2dot
.
count
(
edge
.
first
)
||
!
node2dot
.
count
(
edge
.
second
))
{
LOG
(
ERROR
)
<<
"no node "
<<
edge
.
first
<<
" "
<<
edge
.
second
;
continue
;
}
auto
&
src
=
node2dot
.
at
(
edge
.
first
);
auto
&
trg
=
node2dot
.
at
(
edge
.
second
);
dot
.
AddEdge
(
src
,
trg
,
{});
}
return
dot
.
Build
();
}
// Adds an edge `this -> target` to the owning pattern for every entry of
// `others`, in order, and returns *this so calls can be chained.
PDNode& PDNode::LinksTo(const std::vector<PDNode*>& others) {
  for (auto target = others.begin(); target != others.end(); ++target) {
    pattern_->AddEdge(this, *target);
  }
  return *this;
}
// Adds an edge `source -> this` to the owning pattern for every entry of
// `others`, in order, and returns *this so calls can be chained.
PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) {
  for (auto source = others.begin(); source != others.end(); ++source) {
    pattern_->AddEdge(*source, this);
  }
  return *this;
}
}
// namespace ir
}
// namespace ir
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/ir/graph_pattern_detect
e
r.h
→
paddle/fluid/framework/ir/graph_pattern_detect
o
r.h
浏览文件 @
b98b7440
...
@@ -21,12 +21,14 @@
...
@@ -21,12 +21,14 @@
#include <numeric>
#include <numeric>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/analysis/dot.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
ir
{
namespace
ir
{
class
PDPattern
;
// Some basic t
orminoly
gies:
// Some basic t
erminolo
gies:
// - PDPattern: a pattern defined as a data flow graph.
// - PDPattern: a pattern defined as a data flow graph.
// - PDNode: the node in the pattern, each PDNode represents an `ir::Node`
// - PDNode: the node in the pattern, each PDNode represents an `ir::Node`
// that meets some conditions defined in `PDNode.teller`.
// that meets some conditions defined in `PDNode.teller`.
...
@@ -36,30 +38,43 @@ namespace ir {
...
@@ -36,30 +38,43 @@ namespace ir {
struct
PDNode
{
struct
PDNode
{
// tell whether an ir::Node* is a candidation for a PDNode.
// tell whether an ir::Node* is a candidation for a PDNode.
using
teller_t
=
std
::
function
<
bool
(
Node
*
)
>
;
using
teller_t
=
std
::
function
<
bool
(
Node
*
)
>
;
enum
class
Type
{
kOp
,
kVar
};
PDNode
(
teller_t
&&
teller
,
const
std
::
string
&
name
=
""
)
// this link to others
:
teller_
(
teller
),
name_
(
name
)
{
PDNode
&
LinksTo
(
const
std
::
vector
<
PDNode
*>&
others
);
PADDLE_ENFORCE
(
teller_
!=
nullptr
,
"invalid teller functer is set."
);
PDNode
&
LinksFrom
(
const
std
::
vector
<
PDNode
*>&
others
);
}
PDNode
(
PDNode
&&
other
)
=
default
;
std
::
vector
<
PDNode
*>
inlinks
;
std
::
vector
<
PDNode
*>
outlinks
;
bool
Tell
(
Node
*
node
)
const
{
bool
Tell
(
Node
*
node
)
const
{
PADDLE_ENFORCE
(
teller_
!=
nullptr
,
"teller should be set for a PDNode"
);
PADDLE_ENFORCE
(
teller_
!=
nullptr
,
"teller should be set for a PDNode"
);
return
teller_
(
node
);
return
teller_
(
node
);
}
}
bool
IsOp
()
const
{
return
type_
==
Type
::
kOp
;
}
bool
IsVar
()
const
{
return
type_
==
Type
::
kVar
;
}
const
std
::
string
&
name
()
const
{
return
name_
;
}
const
std
::
string
&
name
()
const
{
return
name_
;
}
PDNode
(
const
PDNode
&
)
=
delete
;
PDNode
(
const
PDNode
&
)
=
delete
;
PDNode
&
operator
=
(
const
PDNode
&
)
=
delete
;
PDNode
&
operator
=
(
const
PDNode
&
)
=
delete
;
private:
private:
PDNode
(
teller_t
&&
teller
,
PDPattern
*
pattern
,
const
std
::
string
&
name
=
""
,
Type
type
=
Type
::
kVar
)
:
teller_
(
std
::
move
(
teller
)),
pattern_
(
pattern
),
name_
(
name
),
type_
(
type
)
{
PADDLE_ENFORCE
(
teller_
!=
nullptr
,
"invalid teller functer is set."
);
}
PDNode
(
PDNode
&&
other
)
=
default
;
friend
class
PDPattern
;
teller_t
teller_
;
teller_t
teller_
;
PDPattern
*
pattern_
;
std
::
string
name_
;
std
::
string
name_
;
Type
type_
;
};
};
/*
/*
...
@@ -102,6 +117,8 @@ class PDPattern {
...
@@ -102,6 +117,8 @@ class PDPattern {
const
std
::
vector
<
std
::
unique_ptr
<
PDNode
>>&
nodes
()
const
{
return
nodes_
;
}
const
std
::
vector
<
std
::
unique_ptr
<
PDNode
>>&
nodes
()
const
{
return
nodes_
;
}
const
std
::
vector
<
edge_t
>&
edges
()
const
{
return
edges_
;
}
const
std
::
vector
<
edge_t
>&
edges
()
const
{
return
edges_
;
}
std
::
string
DotString
()
const
;
private:
private:
#ifdef PADDLE_WITH_TESTING
#ifdef PADDLE_WITH_TESTING
FRIEND_TEST
(
PDPattern
,
AddEdge
);
FRIEND_TEST
(
PDPattern
,
AddEdge
);
...
@@ -117,7 +134,7 @@ class PDPattern {
...
@@ -117,7 +134,7 @@ class PDPattern {
};
};
/*
/*
* GraphPatternDetect
e
r helps to detect the specific patterns in the graph.
* GraphPatternDetect
o
r helps to detect the specific patterns in the graph.
* Input a pattern, output a list of the matched subgraphs/nodes.
* Input a pattern, output a list of the matched subgraphs/nodes.
* This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.).
* This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.).
*
*
...
@@ -129,7 +146,7 @@ class PDPattern {
...
@@ -129,7 +146,7 @@ class PDPattern {
*
*
* Usage:
* Usage:
* // Create a detector
* // Create a detector
* GraphPatternDetect
e
r detector;
* GraphPatternDetect
o
r detector;
* // Define the detector's pattern, by adding PDNode and define the edges.
* // Define the detector's pattern, by adding PDNode and define the edges.
* auto* node0 = detector.mutable_pattern().AddNode(...)
* auto* node0 = detector.mutable_pattern().AddNode(...)
* auto* node1 = detector.mutable_pattern().AddNode(...)
* auto* node1 = detector.mutable_pattern().AddNode(...)
...
@@ -138,11 +155,11 @@ class PDPattern {
...
@@ -138,11 +155,11 @@ class PDPattern {
* detector.mutable_pattern().AddEdge(node0, node1);
* detector.mutable_pattern().AddEdge(node0, node1);
* // Create an handler, to define the behavior of treating the filtered
* // Create an handler, to define the behavior of treating the filtered
* // subgraphs that comply with the patterns.
* // subgraphs that comply with the patterns.
* GraphPatternDetect
e
r::handle_t handler = some labmda
* GraphPatternDetect
o
r::handle_t handler = some lambda
* // Execute the detector.
* // Execute the detector.
* detector(&graph, handler);
* detector(&graph, handler);
*/
*/
class
GraphPatternDetect
e
r
{
class
GraphPatternDetect
o
r
{
public:
public:
using
subgraph_t
=
std
::
unordered_map
<
PDNode
*
,
Node
*>
;
using
subgraph_t
=
std
::
unordered_map
<
PDNode
*
,
Node
*>
;
...
@@ -177,10 +194,62 @@ class GraphPatternDetecter {
...
@@ -177,10 +194,62 @@ class GraphPatternDetecter {
using
hit_rcd_t
=
using
hit_rcd_t
=
std
::
pair
<
Node
*
/*node in graph*/
,
PDNode
*
/*node in pattern*/
>
;
std
::
pair
<
Node
*
/*node in graph*/
,
PDNode
*
/*node in pattern*/
>
;
PDPattern
pattern_
;
PDPattern
pattern_
;
std
::
vector
<
hit_rcd_t
>
marked_records_
;
std
::
unordered_map
<
const
PDNode
*
,
std
::
unordered_set
<
Node
*>>
pdnodes2nodes_
;
std
::
unordered_map
<
const
PDNode
*
,
std
::
unordered_set
<
Node
*>>
pdnodes2nodes_
;
};
};
// some helper methods.
// Returns true when `node` feeds an operator of type `op_type`, i.e. when at
// least one entry of node->outputs is an op node whose OpDesc type equals
// `op_type`.
static bool VarLinksToOp(Node* node, const std::string& op_type) {
  for (auto* consumer : node->outputs) {
    if (!consumer->IsOp()) continue;
    if (consumer->Op()->Type() == op_type) return true;
  }
  return false;
}
// Returns true when `node` is produced by an operator of type `op_type`, i.e.
// when at least one entry of node->inputs is an op node whose OpDesc type
// equals `op_type`.
static bool VarLinksFromOp(Node* node, const std::string& op_type) {
  for (auto* producer : node->inputs) {
    if (!producer->IsOp()) continue;
    if (producer->Op()->Type() == op_type) return true;
  }
  return false;
}
// Check whether a var node is an op node's nth input within the input slot
// named `argument`.
//
// Enforces that `var` is a variable node and `op` is an operator node.
// Returns false when the op has no more than `nth` input nodes, or when the
// slot `argument` lists no more than `nth` names; otherwise compares the
// variable's name with the nth name of that slot.
static bool IsNthInput(Node* var, Node* op, const std::string& argument,
                       size_t nth) {
  PADDLE_ENFORCE(var->IsVar());
  PADDLE_ENFORCE(op->IsOp());
  if (op->inputs.size() <= nth) return false;
  // The vector indexed below is the slot's own name list, so it is this
  // list's size -- not the total number of input nodes -- that must bound
  // `nth`; otherwise the subscript can read past the end.
  const auto& slot_names = op->Op()->Input(argument);
  if (slot_names.size() <= nth) return false;
  return var->Name() == slot_names[nth];
}
// Removes every node in `nodes` from `graph`, then walks the nodes that are
// still in the graph and erases from their `inputs` / `outputs` lists any
// pointer contained in `nodes`. After the removal the pointers in `nodes` are
// only used as lookup keys here.
static void GraphSafeRemoveNodes(
    Graph* graph, const std::unordered_set<const Node*>& nodes) {
  for (auto* doomed : nodes) {
    graph->RemoveNode(const_cast<Node*>(doomed));
  }

  for (auto* survivor : graph->Nodes()) {
    // Drop links coming from removed nodes.
    auto in_it = survivor->inputs.begin();
    while (in_it != survivor->inputs.end()) {
      if (nodes.count(*in_it)) {
        in_it = const_cast<Node*>(survivor)->inputs.erase(in_it);
      } else {
        in_it++;
      }
    }
    // Drop links going to removed nodes.
    auto out_it = survivor->outputs.begin();
    while (out_it != survivor->outputs.end()) {
      if (nodes.count(*out_it)) {
        out_it = const_cast<Node*>(survivor)->outputs.erase(out_it);
      } else {
        out_it++;
      }
    }
  }
}
}
// namespace ir
}
// namespace ir
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/ir/graph_pattern_detect
e
r_tester.cc
→
paddle/fluid/framework/ir/graph_pattern_detect
o
r_tester.cc
浏览文件 @
b98b7440
...
@@ -12,7 +12,7 @@
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/framework/ir/graph_pattern_detect
e
r.h"
#include "paddle/fluid/framework/ir/graph_pattern_detect
o
r.h"
#include <gtest/gtest.h>
#include <gtest/gtest.h>
...
@@ -82,7 +82,7 @@ TEST(PDPattern, AddEdge) {
...
@@ -82,7 +82,7 @@ TEST(PDPattern, AddEdge) {
}
}
TEST
(
GraphPatternDetecter
,
MarkPDNodesInGraph
)
{
TEST
(
GraphPatternDetecter
,
MarkPDNodesInGraph
)
{
GraphPatternDetect
e
r
x
;
GraphPatternDetect
o
r
x
;
// mark o2, o3, v2
// mark o2, o3, v2
// The pattern is a graph:
// The pattern is a graph:
...
@@ -131,7 +131,7 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
...
@@ -131,7 +131,7 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
Graph
graph
(
program
);
Graph
graph
(
program
);
BuildGraph
(
&
graph
);
BuildGraph
(
&
graph
);
GraphPatternDetect
e
r
x
;
GraphPatternDetect
o
r
x
;
// The pattern is a graph:
// The pattern is a graph:
// op -> var
// op -> var
...
@@ -149,8 +149,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
...
@@ -149,8 +149,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
x
.
mutable_pattern
()
->
AddEdge
(
any_var
,
any_op1
);
x
.
mutable_pattern
()
->
AddEdge
(
any_var
,
any_op1
);
int
count
=
0
;
int
count
=
0
;
GraphPatternDetect
e
r
::
handle_t
handle
=
[
&
](
GraphPatternDetect
o
r
::
handle_t
handle
=
[
&
](
const
GraphPatternDetect
e
r
::
subgraph_t
&
s
,
Graph
*
g
)
{
const
GraphPatternDetect
o
r
::
subgraph_t
&
s
,
Graph
*
g
)
{
LOG
(
INFO
)
<<
"Detect "
<<
s
.
at
(
any_op
)
->
Name
()
<<
" -> "
LOG
(
INFO
)
<<
"Detect "
<<
s
.
at
(
any_op
)
->
Name
()
<<
" -> "
<<
s
.
at
(
any_var
)
->
Name
()
<<
" -> "
<<
s
.
at
(
any_op1
)
->
Name
();
<<
s
.
at
(
any_var
)
->
Name
()
<<
" -> "
<<
s
.
at
(
any_op1
)
->
Name
();
count
++
;
count
++
;
...
...
paddle/fluid/framework/ir/graph_to_program_pass.cc
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
std
::
unique_ptr
<
Graph
>
GraphToProgramPass
::
ApplyImpl
(
std
::
unique_ptr
<
Graph
>
graph
)
const
{
ProgramDesc
&
program
=
Get
<
ProgramDesc
>
(
"program"
);
std
::
unique_ptr
<
proto
::
ProgramDesc
>
program_pb
(
new
proto
::
ProgramDesc
(
*
program
.
Proto
()));
auto
block
=
program_pb
->
mutable_blocks
(
kRootBlockIndex
);
block
->
clear_vars
();
std
::
unordered_set
<
std
::
string
>
visited_vars
;
for
(
ir
::
Node
*
n
:
graph
->
Nodes
())
{
if
(
n
->
NodeType
()
==
ir
::
Node
::
Type
::
kVariable
)
{
if
(
n
->
Var
()
&&
visited_vars
.
count
(
n
->
Var
()
->
Name
())
==
0
)
{
visited_vars
.
insert
(
n
->
Var
()
->
Name
());
block
->
add_vars
()
->
MergeFrom
(
*
n
->
Var
()
->
Proto
());
}
}
}
block
->
clear_ops
();
std
::
vector
<
ir
::
Node
*>
nodes
=
TopologySortOperations
(
*
graph
);
for
(
ir
::
Node
*
n
:
nodes
)
{
if
(
!
n
->
Op
())
{
continue
;
}
block
->
add_ops
()
->
MergeFrom
(
*
n
->
Op
()
->
Proto
());
}
program
.
CopyFrom
(
*
program_pb
);
return
graph
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
// Makes GraphToProgramPass available under the name "graph_to_program_pass".
REGISTER_PASS(graph_to_program_pass,
              paddle::framework::ir::GraphToProgramPass);
paddle/fluid/framework/ir/graph_to_program_pass.h
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/pass.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Pass whose ApplyImpl is defined in graph_to_program_pass.cc; this header
// only declares it.
class GraphToProgramPass : public Pass {
 protected:
  // Takes ownership of `graph` and returns a graph to the caller.
  std::unique_ptr<Graph> ApplyImpl(
      std::unique_ptr<Graph> graph) const override;
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_to_program_pass_test.cc
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Populates `g` with five operator nodes (op1..op5) and four variable nodes
// (var1..var4) wired as an acyclic graph:
//
//   o1 -> v1 -> o2
//   o2 -> v2 -> o3
//   o2 -> v2 -> o4
//   o2 -> v3 -> o5
//   o3 -> v4 -> o5
void BuildNoCircleGraph(Graph* g) {
  OpDesc op1, op2, op3, op4, op5;
  op1.SetType("op1");
  op2.SetType("op2");
  op3.SetType("op3");
  op4.SetType("op4");
  op5.SetType("op5");

  VarDesc var1("var1");
  VarDesc var2("var2");
  VarDesc var3("var3");
  VarDesc var4("var4");

  // Creation order is kept: all operators first, then all variables.
  ir::Node* o1 = g->CreateOpNode(&op1);
  ir::Node* o2 = g->CreateOpNode(&op2);
  ir::Node* o3 = g->CreateOpNode(&op3);
  ir::Node* o4 = g->CreateOpNode(&op4);
  ir::Node* o5 = g->CreateOpNode(&op5);
  ir::Node* v1 = g->CreateVarNode(&var1);
  ir::Node* v2 = g->CreateVarNode(&var2);
  ir::Node* v3 = g->CreateVarNode(&var3);
  ir::Node* v4 = g->CreateVarNode(&var4);

  // op writes var: record the link on both endpoints.
  auto produce = [](ir::Node* op, ir::Node* var) {
    op->outputs.push_back(var);
    var->inputs.push_back(op);
  };
  // op reads var: record the link on both endpoints.
  auto consume = [](ir::Node* var, ir::Node* op) {
    var->outputs.push_back(op);
    op->inputs.push_back(var);
  };

  // o1->v1->o2
  produce(o1, v1);
  consume(v1, o2);
  // o2->v2->o3
  // o2->v2->o4
  produce(o2, v2);
  consume(v2, o3);
  consume(v2, o4);
  // o2->v3->o5
  produce(o2, v3);
  consume(v3, o5);
  // o3->v4->o5
  produce(o3, v4);
  consume(v4, o5);
}
// Runs graph_to_program_pass on the graph from BuildNoCircleGraph and checks
// the ops and vars written into the target ProgramDesc.
TEST(GraphToProgramPass, Basic) {
  ProgramDesc prog;
  std::unique_ptr<Graph> g(new Graph(prog));
  BuildNoCircleGraph(g.get());

  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
      "graph_to_program_pass");

  ProgramDesc compiled_prog;
  pass->SetNotOwned<paddle::framework::ProgramDesc>("program", &compiled_prog);
  pass->Apply(std::move(g));

  std::vector<OpDesc*> ops = compiled_prog.Block(0).AllOps();
  // Abort here instead of indexing past the end if the pass emitted fewer
  // operators than the graph contains.
  ASSERT_EQ(ops.size(), static_cast<size_t>(5));
  EXPECT_EQ(ops[0]->Type(), "op1");
  EXPECT_EQ(ops[1]->Type(), "op2");
  // op3 and op4 both depend only on op2, so either relative order is valid.
  if (ops[2]->Type() == "op3") {
    EXPECT_EQ(ops[3]->Type(), "op4");
  } else {
    // Any other value in slot 2 must be reported, not silently accepted.
    EXPECT_EQ(ops[2]->Type(), "op4");
    EXPECT_EQ(ops[3]->Type(), "op3");
  }
  EXPECT_EQ(ops[4]->Type(), "op5");

  std::unordered_set<std::string> vars;
  for (VarDesc* v : compiled_prog.Block(0).AllVars()) {
    vars.insert(v->Name());
  }
  EXPECT_TRUE(vars.find("var1") != vars.end());
  EXPECT_TRUE(vars.find("var2") != vars.end());
  EXPECT_TRUE(vars.find("var3") != vars.end());
  // var4 is created by BuildNoCircleGraph as well and must survive the pass.
  EXPECT_TRUE(vars.find("var4") != vars.end());
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
// Pulls the graph_to_program_pass registration into this test binary.
USE_PASS(graph_to_program_pass);
paddle/fluid/framework/ir/graph_viz_pass.cc
浏览文件 @
b98b7440
...
@@ -16,11 +16,13 @@ limitations under the License. */
...
@@ -16,11 +16,13 @@ limitations under the License. */
#include <unordered_set>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/inference/analysis/dot.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
ir
{
namespace
ir
{
static
const
char
kGraphVizPath
[]
=
"graph_viz_path"
;
static
const
char
kGraphVizPath
[]
=
"graph_viz_path"
;
using
inference
::
analysis
::
Dot
;
std
::
unique_ptr
<
ir
::
Graph
>
GraphVizPass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
GraphVizPass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
...
@@ -30,41 +32,65 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
...
@@ -30,41 +32,65 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
PADDLE_ENFORCE
(
fout
->
good
());
PADDLE_ENFORCE
(
fout
->
good
());
std
::
ostream
&
sout
=
*
fout
;
std
::
ostream
&
sout
=
*
fout
;
size_t
var_id
=
0
;
std
::
unordered_map
<
const
ir
::
Node
*
,
std
::
string
>
node2dot
;
std
::
unordered_map
<
const
ir
::
Node
*
,
size_t
>
vars
;
Dot
dot
;
sout
<<
"digraph G {
\n
"
;
std
::
vector
<
Dot
::
Attr
>
op_attrs
({
Dot
::
Attr
(
"style"
,
"filled"
),
for
(
const
ir
::
Node
*
n
:
graph
->
Nodes
())
{
Dot
::
Attr
(
"shape"
,
"box"
),
if
(
n
->
NodeType
()
!=
ir
::
Node
::
Type
::
kVariable
)
continue
;
Dot
::
Attr
(
"fillcolor"
,
"red"
)});
size_t
cur_var_id
=
var_id
++
;
std
::
vector
<
Dot
::
Attr
>
var_attrs
({
Dot
::
Attr
(
"style"
,
"filled,rounded"
),
vars
[
n
]
=
cur_var_id
;
// Dot::Attr("shape", "diamond"),
Dot
::
Attr
(
"fillcolor"
,
"yellow"
)});
sout
<<
"var_"
<<
cur_var_id
<<
" [label=
\"
"
<<
n
->
Name
()
<<
"
\"
]"
<<
std
::
endl
;
std
::
vector
<
Dot
::
Attr
>
marked_op_attrs
({
Dot
::
Attr
(
"style"
,
"filled"
),
Dot
::
Attr
(
"shape"
,
"box"
),
Dot
::
Attr
(
"fillcolor"
,
"lightgray"
)});
std
::
vector
<
Dot
::
Attr
>
marked_var_attrs
(
{
Dot
::
Attr
(
"style"
,
"filled,rounded"
),
// Dot::Attr("shape", "diamond"),
Dot
::
Attr
(
"fillcolor"
,
"lightgray"
)});
auto
marked_nodes
=
ConsumeMarkedNodes
(
graph
.
get
());
// Create nodes
for
(
const
Node
*
n
:
graph
->
Nodes
())
{
std
::
string
node_id
=
n
->
Name
()
+
"("
+
std
::
to_string
(
n
->
id
())
+
")"
;
if
(
n
->
IsOp
())
{
decltype
(
op_attrs
)
attr
=
marked_nodes
.
count
(
n
)
?
marked_op_attrs
:
op_attrs
;
dot
.
AddNode
(
node_id
,
attr
,
node_id
);
}
else
if
(
n
->
IsVar
())
{
decltype
(
op_attrs
)
attr
=
marked_nodes
.
count
(
n
)
?
marked_var_attrs
:
var_attrs
;
dot
.
AddNode
(
node_id
,
attr
,
node_id
);
}
}
node2dot
[
n
]
=
node_id
;
size_t
op_id
=
0
;
for
(
const
ir
::
Node
*
n
:
graph
->
Nodes
())
{
if
(
n
->
NodeType
()
!=
ir
::
Node
::
Type
::
kOperation
)
continue
;
std
::
string
op_name
=
"op_"
+
std
::
to_string
(
op_id
++
);
sout
<<
op_name
<<
" [label=
\"
"
<<
n
->
Name
()
<<
"
\"
, shape=rect]"
<<
std
::
endl
;
for
(
auto
in
:
n
->
inputs
)
{
std
::
string
var_name
=
"var_"
+
std
::
to_string
(
vars
[
in
]);
sout
<<
var_name
<<
" -> "
<<
op_name
<<
std
::
endl
;
}
}
// Create edges
for
(
auto
out
:
n
->
outputs
)
{
for
(
const
Node
*
n
:
graph
->
Nodes
())
{
std
::
string
var_name
=
"var_"
+
std
::
to_string
(
vars
[
out
]);
const
auto
&
src_id
=
node2dot
.
at
(
n
);
sout
<<
op_name
<<
" -> "
<<
var_name
<<
std
::
endl
;
for
(
auto
*
out
:
n
->
outputs
)
{
const
auto
&
trg_id
=
node2dot
.
at
(
out
);
dot
.
AddEdge
(
src_id
,
trg_id
,
{});
}
}
}
}
sout
<<
"}
\n
"
;
sout
<<
dot
.
Build
();
return
graph
;
return
graph
;
}
}
// Hands back a copy of the node set stored under kGraphvizMarkedNodeAttr and
// empties the stored set. Yields an empty set when the graph does not carry
// that attribute.
GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes(
    Graph* graph) const {
  marked_nodes_t consumed;
  if (!graph->Has(kGraphvizMarkedNodeAttr)) {
    return consumed;
  }

  auto& stored = graph->Get<marked_nodes_t>(kGraphvizMarkedNodeAttr);
  consumed = stored;
  // Clear the attribute so the same marks are not picked up again.
  stored.clear();
  return consumed;
}
}
// namespace ir
}
// namespace ir
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
...
...
paddle/fluid/framework/ir/graph_viz_pass.h
浏览文件 @
b98b7440
...
@@ -27,10 +27,19 @@ namespace paddle {
...
@@ -27,10 +27,19 @@ namespace paddle {
namespace
framework
{
namespace
framework
{
namespace
ir
{
namespace
ir
{
const
char
kGraphvizMarkedNodeAttr
[]
=
"__graphviz__marked_node__"
;
class
GraphVizPass
:
public
Pass
{
class
GraphVizPass
:
public
Pass
{
public:
using
marked_nodes_t
=
std
::
unordered_set
<
const
Node
*>
;
protected:
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
// Return the set of marked nodes stored in the graph (empty if there are
// none) and clear the corresponding attribute.
marked_nodes_t
ConsumeMarkedNodes
(
Graph
*
graph
)
const
;
};
};
}
// namespace ir
}
// namespace ir
...
...
paddle/fluid/framework/ir/node.h
浏览文件 @
b98b7440
...
@@ -29,20 +29,26 @@ class Node {
...
@@ -29,20 +29,26 @@ class Node {
enum
class
Type
{
kOperation
,
kVariable
};
enum
class
Type
{
kOperation
,
kVariable
};
static
constexpr
char
kControlDepVarName
[]
=
"__control_var"
;
static
constexpr
char
kControlDepVarName
[]
=
"__control_var"
;
explicit
Node
(
const
std
::
string
&
name
,
Type
type
)
explicit
Node
(
const
std
::
string
&
name
,
Type
type
,
int
id
=
-
1
)
:
name_
(
name
),
var_desc_
(
nullptr
),
op_desc_
(
nullptr
),
type_
(
type
)
{}
:
name_
(
name
),
var_desc_
(
nullptr
),
op_desc_
(
nullptr
),
type_
(
type
),
id_
(
id
)
{}
explicit
Node
(
VarDesc
*
var_desc
)
explicit
Node
(
VarDesc
*
var_desc
,
int
id
=
-
1
)
:
name_
(
var_desc
->
Name
()),
:
name_
(
var_desc
->
Name
()),
var_desc_
(
new
VarDesc
(
*
var_desc
)),
var_desc_
(
new
VarDesc
(
*
var_desc
)),
op_desc_
(
nullptr
),
op_desc_
(
nullptr
),
type_
(
Type
::
kVariable
)
{}
type_
(
Type
::
kVariable
),
id_
(
id
)
{}
explicit
Node
(
OpDesc
*
op_desc
)
explicit
Node
(
OpDesc
*
op_desc
,
int
id
=
-
1
)
:
name_
(
op_desc
->
Type
()),
:
name_
(
op_desc
->
Type
()),
var_desc_
(
nullptr
),
var_desc_
(
nullptr
),
op_desc_
(
new
OpDesc
(
*
op_desc
,
op_desc
->
Block
())),
op_desc_
(
new
OpDesc
(
*
op_desc
,
op_desc
->
Block
())),
type_
(
Type
::
kOperation
)
{}
type_
(
Type
::
kOperation
),
id_
(
id
)
{}
Type
NodeType
()
const
{
return
type_
;
}
Type
NodeType
()
const
{
return
type_
;
}
...
@@ -58,6 +64,8 @@ class Node {
...
@@ -58,6 +64,8 @@ class Node {
return
op_desc_
.
get
();
return
op_desc_
.
get
();
}
}
int
id
()
const
{
return
id_
;
}
bool
IsOp
()
const
{
return
type_
==
Type
::
kOperation
;
}
bool
IsOp
()
const
{
return
type_
==
Type
::
kOperation
;
}
bool
IsVar
()
const
{
return
type_
==
Type
::
kVariable
;
}
bool
IsVar
()
const
{
return
type_
==
Type
::
kVariable
;
}
...
@@ -69,6 +77,7 @@ class Node {
...
@@ -69,6 +77,7 @@ class Node {
std
::
unique_ptr
<
VarDesc
>
var_desc_
;
std
::
unique_ptr
<
VarDesc
>
var_desc_
;
std
::
unique_ptr
<
OpDesc
>
op_desc_
;
std
::
unique_ptr
<
OpDesc
>
op_desc_
;
Type
type_
;
Type
type_
;
int
id_
;
private:
private:
DISABLE_COPY_AND_ASSIGN
(
Node
);
DISABLE_COPY_AND_ASSIGN
(
Node
);
...
...
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
0 → 100644
浏览文件 @
b98b7440
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Empty placeholder type; nothing else in this file refers to it.
struct FuseExpr {};
// Registers the "two sequence_expand ops feeding a concat" pattern on
// `pattern` and returns the pattern node standing for concat's output.
PDNode* BuildSeqExpandConcatPattern(PDPattern* pattern) {
  // Operators that will be fused:
  //   concat
  //   sequence_expand
  //   sequence_expand
  // Variables treated as inputs:
  //   concat's remaining input,        0th input of the fused op
  //   first sequence_expand's input,   1st input of the fused op
  //   second sequence_expand's input,  2nd input of the fused op
  // Variable treated as output:
  //   concat's output
  // Variables that will be removed:
  //   first sequence_expand's output
  //   second sequence_expand's output

  // The three operators.
  auto* sequence_expand0 = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsOp()) return false;
        return node->Op()->Type() == "sequence_expand";
      },
      "sequence_expand0");

  auto* sequence_expand1 = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsOp()) return false;
        return node->Op()->Type() == "sequence_expand";
      },
      "sequence_expand1");

  auto* concat = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsOp()) return false;
        if (!(node->Op()->Type() == "concat")) return false;
        // Special case: only a concat with exactly three "X" inputs matches.
        return node->Op()->Input("X").size() == 3;
      },
      "concat");

  // Inputs of the two sequence_expand ops.
  auto* sequence_expand0_in = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        return VarLinksToOp(node, "sequence_expand");
      },
      "sequence_expand0_in");

  auto* sequence_expand1_in = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        return VarLinksToOp(node, "sequence_expand");
      },
      "sequence_expand1_in");

  // Intermediate and boundary variables.
  auto* sequence_expand0_out = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        // Produced by a sequence_expand ...
        if (!VarLinksFromOp(node, "sequence_expand")) return false;
        // ... consumed by a concat ...
        if (!VarLinksToOp(node, "concat")) return false;
        // ... as input X[1] of its first consumer.
        return IsNthInput(node, node->outputs[0], "X", 1);
      },
      "sequence_expand0_out");

  auto* sequence_expand1_out = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        // Produced by a sequence_expand ...
        if (!VarLinksFromOp(node, "sequence_expand")) return false;
        // ... consumed by a concat ...
        if (!VarLinksToOp(node, "concat")) return false;
        // ... as input X[2] of its first consumer.
        return IsNthInput(node, node->outputs[0], "X", 2);
      },
      "sequence_expand1_out");

  auto* concat_in0 = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        return VarLinksToOp(node, "concat");
      },
      "concat_in0");

  auto* concat_out = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        return VarLinksFromOp(node, "concat");
      },
      "concat_out");

  // Wire the pattern nodes together.
  sequence_expand0->LinksFrom({sequence_expand0_in})
      .LinksTo({sequence_expand0_out});
  sequence_expand1->LinksFrom({sequence_expand1_in})
      .LinksTo({sequence_expand1_out});
  concat->LinksFrom({sequence_expand0_out, sequence_expand1_out, concat_in0})
      .LinksTo({concat_out});

  return concat_out;
}
// Registers the "mul -> elementwise_add -> activation" (fully connected)
// pattern on `pattern`, using `fc_x` as the data input of the mul, and
// returns the pattern node standing for the activation's output.
PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) {
  // Weight of the mul: a persistable variable consumed by a mul op.
  PDNode* fc_w = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        if (!VarLinksToOp(node, "mul")) return false;
        return node->Var()->Proto()->persistable();  // is a parameter
      },
      "fc_w");

  // Output of the mul: a non-persistable variable that flows from a mul op
  // into an elementwise_add op.
  PDNode* mul_out = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        if (!VarLinksFromOp(node, "mul")) return false;
        if (!VarLinksToOp(node, "elementwise_add")) return false;
        return !node->Var()->Proto()->persistable();  // is not a parameter
      },
      "mul_out");

  PDNode* fc_mul = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsOp()) return false;
        return node->Op()->Type() == "mul";
      },
      "fc_mul");

  // Bias: a persistable variable consumed by an elementwise_add op.
  PDNode* fc_bias = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        if (!VarLinksToOp(node, "elementwise_add")) return false;
        return node->Var()->Proto()->persistable();  // is a parameter
      },
      "fc_bias");

  PDNode* elementwise_add = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsOp()) return false;
        return node->Op()->Type() == "elementwise_add";
      },
      "elementwise_add");

  // Output of the add: a non-persistable variable produced by
  // elementwise_add.
  PDNode* add_out = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        if (!VarLinksFromOp(node, "elementwise_add")) return false;
        return !node->Var()->Proto()->persistable();  // is not a parameter
      },
      "add_out");

  // Activation op types accepted as the last stage of the pattern. The set is
  // captured by value because the lambda outlives this function's locals.
  std::set<std::string> acts({"sigmoid", "tanh", "relu", "identity"});
  PDNode* act = pattern->NewNode(
      [acts](Node* node) -> bool {
        if (!node || !node->IsOp()) return false;
        return acts.count(node->Op()->Type()) > 0;
      },
      "act");

  // Final output: any non-persistable variable.
  PDNode* fc_out = pattern->NewNode(
      [](Node* node) -> bool {
        if (!node || !node->IsVar()) return false;
        return !node->Var()->Proto()->persistable();  // is not a parameter
      },
      "fc_out");

  // Wire the pattern nodes together.
  fc_mul->LinksFrom({fc_w, fc_x}).LinksTo({mul_out});
  elementwise_add->LinksFrom({mul_out, fc_bias}).LinksTo({add_out});
  act->LinksFrom({add_out}).LinksTo({fc_out});

  return fc_out;
}
// Detects every subgraph matching the sequence_expand/concat pattern followed
// by the FC pattern and replaces it with one "fusion_seqexpand_concat_fc" op.
// The weight, bias, the three data inputs and the final output variable are
// kept; all other matched nodes are removed from the graph.
std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  FusePassBase::Init(graph.get());
  GraphPatternDetector detector;
  auto* pattern = detector.mutable_pattern();
  auto* concat_out = BuildSeqExpandConcatPattern(pattern);
  BuildFCPattern(pattern, concat_out);

// Declares a local `id` bound to the graph node matched by the pattern node
// named #id, enforcing that the match exists and is non-null.
#define GET_NODE(id, pattern) \
  PADDLE_ENFORCE(subgraph.count(pattern.RetriveNode(#id)), \
                 "pattern has no Node called %s", #id); \
  auto* id = subgraph.at(pattern.RetriveNode(#id)); \
  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);

  detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
                            Graph* graph) {
    VLOG(4) << "get one concat pattern";
    // fc
    GET_NODE(fc_w, detector.pattern());
    GET_NODE(fc_bias, detector.pattern());
    GET_NODE(act, detector.pattern());
    GET_NODE(fc_out, detector.pattern());

    // concat
    GET_NODE(concat_in0, detector.pattern());
    GET_NODE(sequence_expand0_in, detector.pattern());
    GET_NODE(sequence_expand1_in, detector.pattern());

    // Describe the fused operator.
    OpDesc op_desc;
    op_desc.SetType("fusion_seqexpand_concat_fc");
    op_desc.SetInput("X", {concat_in0->Name(), sequence_expand0_in->Name(),
                           sequence_expand1_in->Name()});
    op_desc.SetInput("FCWeight", {fc_w->Name()});
    op_desc.SetInput("FCBias", {fc_bias->Name()});
    // "FCOut" is bound to a freshly created LoDTensor variable in the
    // parameter scope, named after the real output with a ".tmp" suffix.
    const std::string fc_out_tmp = fc_out->Name() + ".tmp";
    param_scope()->Var(fc_out_tmp)->GetMutable<framework::LoDTensor>();
    op_desc.SetOutput("FCOut", {fc_out_tmp});
    op_desc.SetOutput("Out", {fc_out->Name()});
    op_desc.SetAttr("fc_activation", act->Op()->Type());

    auto* op_node = graph->CreateOpNode(&op_desc);

    // Add links: record the edge `from -> to` on both endpoints.
    auto link_nodes = [](Node* from, Node* to) {
      from->outputs.push_back(to);
      to->inputs.push_back(from);
    };
    link_nodes(fc_w, op_node);
    link_nodes(fc_bias, op_node);
    link_nodes(concat_in0, op_node);
    link_nodes(sequence_expand0_in, op_node);
    link_nodes(sequence_expand1_in, op_node);
    link_nodes(op_node, fc_out);

    // Clean nodes: everything matched is removed except the nodes that the
    // fused op still reads from or writes to.
    std::unordered_set<const Node*> marked_nodes;
    for (auto& item : subgraph) {
      marked_nodes.insert(item.second);
    }
    marked_nodes.erase(fc_w);
    marked_nodes.erase(fc_bias);
    marked_nodes.erase(concat_in0);
    marked_nodes.erase(sequence_expand0_in);
    marked_nodes.erase(sequence_expand1_in);
    marked_nodes.erase(fc_out);

    GraphSafeRemoveNodes(graph, marked_nodes);
  });

// Keep the helper macro from leaking past this function.
#undef GET_NODE

  return graph;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
seq_concat_fc_fuse_pass
,
paddle
::
framework
::
ir
::
SeqConcatFcFusePass
);
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
0 → 100644
浏览文件 @
b98b7440
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
class
SeqConcatFcFusePass
:
public
FusePassBase
{
public:
virtual
~
SeqConcatFcFusePass
()
{}
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
;
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/op_desc.cc
浏览文件 @
b98b7440
...
@@ -95,6 +95,12 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
...
@@ -95,6 +95,12 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
need_update_
=
true
;
need_update_
=
true
;
}
}
// Builds an OpDesc by copying `other` via CopyFrom() and then binding the
// copy to `block`.
OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) {
  CopyFrom(other);
  // Assigned after CopyFrom so the copy belongs to `block`.
  block_ = block;
  // NOTE(review): presumably forces the proto to be regenerated on the next
  // flush -- confirm against OpDesc::Flush.
  need_update_ = true;
}
void
OpDesc
::
CopyFrom
(
const
OpDesc
&
op_desc
)
{
void
OpDesc
::
CopyFrom
(
const
OpDesc
&
op_desc
)
{
desc_
.
set_type
(
op_desc
.
Type
());
desc_
.
set_type
(
op_desc
.
Type
());
inputs_
=
op_desc
.
inputs_
;
inputs_
=
op_desc
.
inputs_
;
...
@@ -131,8 +137,9 @@ OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block)
...
@@ -131,8 +137,9 @@ OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block)
for
(
const
proto
::
OpDesc
::
Attr
&
attr
:
desc_
.
attrs
())
{
for
(
const
proto
::
OpDesc
::
Attr
&
attr
:
desc_
.
attrs
())
{
std
::
string
attr_name
=
attr
.
name
();
std
::
string
attr_name
=
attr
.
name
();
// The sub_block referred to by the BLOCK attr hasn't been added
// The sub_block referred to by the BLOCK attr hasn't been added
// to ProgramDesc class yet, we skip setting BLOCK attr here.
// to ProgramDesc class yet, we skip setting BLOCK/BLOCKS attr here.
if
(
attr
.
type
()
!=
proto
::
AttrType
::
BLOCK
)
{
if
(
attr
.
type
()
!=
proto
::
AttrType
::
BLOCK
&&
attr
.
type
()
!=
proto
::
AttrType
::
BLOCKS
)
{
attrs_
[
attr_name
]
=
GetAttrValue
(
attr
);
attrs_
[
attr_name
]
=
GetAttrValue
(
attr
);
}
}
}
}
...
...
paddle/fluid/framework/op_desc.h
浏览文件 @
b98b7440
...
@@ -37,11 +37,7 @@ class OpDesc {
...
@@ -37,11 +37,7 @@ class OpDesc {
explicit
OpDesc
(
BlockDesc
*
block
)
:
block_
(
block
)
{}
explicit
OpDesc
(
BlockDesc
*
block
)
:
block_
(
block
)
{}
OpDesc
(
const
OpDesc
&
other
,
BlockDesc
*
block
)
{
OpDesc
(
const
OpDesc
&
other
,
BlockDesc
*
block
);
*
this
=
other
;
block_
=
block
;
need_update_
=
true
;
}
void
CopyFrom
(
const
OpDesc
&
op_desc
);
void
CopyFrom
(
const
OpDesc
&
op_desc
);
...
...
paddle/fluid/framework/program_desc.cc
浏览文件 @
b98b7440
...
@@ -80,6 +80,12 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
...
@@ -80,6 +80,12 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
InitFromProto
();
InitFromProto
();
}
}
// Replaces the contents of this ProgramDesc with `desc`: drops the current
// blocks, copies the proto, then calls InitFromProto().
void ProgramDesc::CopyFrom(const proto::ProgramDesc &desc) {
  // Clear first so no previously held blocks remain.
  blocks_.clear();
  desc_ = desc;
  InitFromProto();
}
ProgramDesc
::
ProgramDesc
(
const
std
::
string
&
binary_str
)
{
ProgramDesc
::
ProgramDesc
(
const
std
::
string
&
binary_str
)
{
PADDLE_ENFORCE
(
desc_
.
ParseFromString
(
binary_str
),
PADDLE_ENFORCE
(
desc_
.
ParseFromString
(
binary_str
),
"Fail to parse program_desc from binary string."
);
"Fail to parse program_desc from binary string."
);
...
@@ -111,10 +117,16 @@ void ProgramDesc::InitFromProto() {
...
@@ -111,10 +117,16 @@ void ProgramDesc::InitFromProto() {
const
std
::
vector
<
std
::
string
>
ProgramDesc
::
GetFeedTargetNames
()
{
const
std
::
vector
<
std
::
string
>
ProgramDesc
::
GetFeedTargetNames
()
{
auto
&
global_block
=
Block
(
0
);
auto
&
global_block
=
Block
(
0
);
// The order of feed_target_names must follow the index specified in `col`,
// since the feed operators' order doesn't necessarily follow `col`.
std
::
vector
<
std
::
string
>
feed_target_names
;
std
::
vector
<
std
::
string
>
feed_target_names
;
for
(
auto
*
op
:
global_block
.
AllOps
())
{
for
(
auto
*
op
:
global_block
.
AllOps
())
{
if
(
op
->
Type
()
==
kFeedOpType
)
{
if
(
op
->
Type
()
==
kFeedOpType
)
{
feed_target_names
.
insert
(
feed_target_names
.
begin
(),
op
->
Output
(
"Out"
)[
0
]);
int
col
=
boost
::
get
<
int
>
(
op
->
GetAttr
(
"col"
));
if
(
col
>=
feed_target_names
.
size
())
{
feed_target_names
.
resize
(
col
+
1
);
}
feed_target_names
[
col
]
=
op
->
Output
(
"Out"
)[
0
];
}
}
}
}
return
feed_target_names
;
return
feed_target_names
;
...
@@ -122,10 +134,16 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
...
@@ -122,10 +134,16 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
const
std
::
vector
<
std
::
string
>
ProgramDesc
::
GetFetchTargetNames
()
{
const
std
::
vector
<
std
::
string
>
ProgramDesc
::
GetFetchTargetNames
()
{
auto
&
global_block
=
Block
(
0
);
auto
&
global_block
=
Block
(
0
);
// The order of fetch_target_names must follow the index specified in `col`,
// since the fetch operators' order doesn't necessarily follow `col`.
std
::
vector
<
std
::
string
>
fetch_target_names
;
std
::
vector
<
std
::
string
>
fetch_target_names
;
for
(
auto
*
op
:
global_block
.
AllOps
())
{
for
(
auto
*
op
:
global_block
.
AllOps
())
{
if
(
op
->
Type
()
==
kFetchOpType
)
{
if
(
op
->
Type
()
==
kFetchOpType
)
{
fetch_target_names
.
push_back
(
op
->
Input
(
"X"
)[
0
]);
int
col
=
boost
::
get
<
int
>
(
op
->
GetAttr
(
"col"
));
if
(
col
>=
fetch_target_names
.
size
())
{
fetch_target_names
.
resize
(
col
+
1
);
}
fetch_target_names
[
col
]
=
op
->
Input
(
"X"
)[
0
];
}
}
}
}
return
fetch_target_names
;
return
fetch_target_names
;
...
...
paddle/fluid/framework/program_desc.h
浏览文件 @
b98b7440
...
@@ -53,6 +53,8 @@ class ProgramDesc {
...
@@ -53,6 +53,8 @@ class ProgramDesc {
void
Flush
();
void
Flush
();
void
CopyFrom
(
const
proto
::
ProgramDesc
&
desc
);
proto
::
ProgramDesc
*
Proto
();
proto
::
ProgramDesc
*
Proto
();
// The output variable of feed_op is referenced as feed_target.
// The output variable of feed_op is referenced as feed_target.
...
...
paddle/fluid/framework/tensor.cc
浏览文件 @
b98b7440
...
@@ -40,7 +40,11 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
...
@@ -40,7 +40,11 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
"When calling this method, the Tensor's numel must be "
"When calling this method, the Tensor's numel must be "
"equal or larger than zero. "
"equal or larger than zero. "
"Please check Tensor::Resize has been called first."
);
"Please check Tensor::Resize has been called first."
);
size_t
size
=
requested_size
?
requested_size
:
numel
()
*
SizeOfType
(
type
);
size_t
size
=
numel
()
*
SizeOfType
(
type
);
if
(
requested_size
)
{
PADDLE_ENFORCE_GE
(
requested_size
,
size
);
size
=
requested_size
;
}
/* some versions of boost::variant don't have operator!= */
/* some versions of boost::variant don't have operator!= */
if
(
holder_
==
nullptr
||
!
(
holder_
->
place
()
==
place
)
||
if
(
holder_
==
nullptr
||
!
(
holder_
->
place
()
==
place
)
||
holder_
->
size
()
<
size
+
offset_
)
{
holder_
->
size
()
<
size
+
offset_
)
{
...
...
paddle/fluid/framework/var_type.h
浏览文件 @
b98b7440
...
@@ -26,7 +26,7 @@ namespace paddle {
...
@@ -26,7 +26,7 @@ namespace paddle {
namespace
framework
{
namespace
framework
{
template
<
typename
T
>
template
<
typename
T
>
bool
IsType
(
const
std
::
type_index
&
type_index
)
{
inline
bool
IsType
(
const
std
::
type_index
&
type_index
)
{
return
type_index
==
std
::
type_index
(
typeid
(
T
));
return
type_index
==
std
::
type_index
(
typeid
(
T
));
}
}
...
...
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
b98b7440
...
@@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
...
@@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library
(
paddle_fluid_api
cc_library
(
paddle_fluid_api
SRCS io.cc
SRCS io.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OP_LIB
}
)
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OP_LIB
}
graph_to_program_pass
)
get_property
(
fluid_modules GLOBAL PROPERTY FLUID_MODULES
)
get_property
(
fluid_modules GLOBAL PROPERTY FLUID_MODULES
)
...
...
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
b98b7440
cc_library
(
ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass
)
cc_library
(
ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass
)
cc_library
(
analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
set
(
analysis_deps
framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor
)
cc_library
(
analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
analyzer.cc
analyzer.cc
helper.cc
helper.cc
# passes
# passes
...
@@ -10,11 +13,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
...
@@ -10,11 +13,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
tensorrt_subgraph_node_mark_pass.cc
tensorrt_subgraph_node_mark_pass.cc
fluid_to_ir_pass.cc
fluid_to_ir_pass.cc
model_store_pass.cc
model_store_pass.cc
DEPS
framework_proto proto_desc ir_pass_manager graph pass
)
DEPS
${
analysis_deps
}
)
cc_test
(
test_node SRCS node_tester.cc DEPS analysis
)
cc_test
(
test_node SRCS node_tester.cc DEPS analysis
)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
cc_binary
(
inference_analyzer SRCS analyzer_main.cc DEPS analysis
)
cc_binary
(
inference_analyzer SRCS analyzer_main.cc DEPS analysis
paddle_fluid
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
...
@@ -31,7 +34,7 @@ function (inference_analysis_test TARGET)
...
@@ -31,7 +34,7 @@ function (inference_analysis_test TARGET)
endif
()
endif
()
cc_test
(
${
TARGET
}
cc_test
(
${
TARGET
}
SRCS
"
${
analysis_test_SRCS
}
"
SRCS
"
${
analysis_test_SRCS
}
"
DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detect
e
r pass
${
analysis_test_EXTRA_DEPS
}
DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detect
o
r pass
${
analysis_test_EXTRA_DEPS
}
ARGS --inference_model_dir=
${
PYTHON_TESTS_DIR
}
/book/word2vec.inference.model
${
mem_opt
}
)
ARGS --inference_model_dir=
${
PYTHON_TESTS_DIR
}
/book/word2vec.inference.model
${
mem_opt
}
)
set_tests_properties
(
${
TARGET
}
PROPERTIES DEPENDS test_word2vec
)
set_tests_properties
(
${
TARGET
}
PROPERTIES DEPENDS test_word2vec
)
endif
(
WITH_TESTING
)
endif
(
WITH_TESTING
)
...
@@ -58,20 +61,25 @@ endif()
...
@@ -58,20 +61,25 @@ endif()
inference_analysis_test
(
test_analyzer SRCS analyzer_tester.cc
inference_analysis_test
(
test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
analysis_predictor
# ir
# ir
fc_fuse_pass
fc_fuse_pass
fc_lstm_fuse_pass
seq_concat_fc_fuse_pass
graph_viz_pass
graph_viz_pass
infer_clean_graph_pass
infer_clean_graph_pass
graph_pattern_detect
e
r
graph_pattern_detect
o
r
infer_clean_graph_pass
infer_clean_graph_pass
attention_lstm_fuse_pass
paddle_inference_api
pass
pass
ARGS --inference_model_dir=
${
PYTHON_TESTS_DIR
}
/book/word2vec.inference.model
ARGS --inference_model_dir=
${
PYTHON_TESTS_DIR
}
/book/word2vec.inference.model
--infer_ditu_rnn_model=
${
DITU_INSTALL_DIR
}
/model
--infer_ditu_rnn_model=
${
DITU_INSTALL_DIR
}
/model
--infer_ditu_rnn_data=
${
DITU_INSTALL_DIR
}
/data.txt
)
--infer_ditu_rnn_data=
${
DITU_INSTALL_DIR
}
/data.txt
)
inference_analysis_test
(
test_data_flow_graph SRCS data_flow_graph_tester.cc
)
inference_analysis_test
(
test_data_flow_graph SRCS data_flow_graph_tester.cc
)
inference_analysis_test
(
test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc
)
inference_analysis_test
(
test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc
EXTRA_DEPS paddle_inference_api
)
inference_analysis_test
(
test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc
)
inference_analysis_test
(
test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc
EXTRA_DEPS paddle_fluid
)
inference_analysis_test
(
test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc
)
inference_analysis_test
(
test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc
)
inference_analysis_test
(
test_subgraph_splitter SRCS subgraph_splitter_tester.cc
)
inference_analysis_test
(
test_subgraph_splitter SRCS subgraph_splitter_tester.cc
)
inference_analysis_test
(
test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc
)
inference_analysis_test
(
test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc
)
...
...
paddle/fluid/inference/analysis/analyzer.cc
浏览文件 @
b98b7440
...
@@ -72,7 +72,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
...
@@ -72,7 +72,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
auto
trt_teller
=
[
&
](
const
Node
*
node
)
{
auto
trt_teller
=
[
&
](
const
Node
*
node
)
{
std
::
unordered_set
<
std
::
string
>
teller_set
(
std
::
unordered_set
<
std
::
string
>
teller_set
(
{
"elementwise_add"
,
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
,
{
"elementwise_add"
,
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
,
"depthwise_conv2d"
,
"batch_norm"
});
"depthwise_conv2d"
,
"batch_norm"
,
"concat"
});
if
(
!
node
->
IsFunction
())
return
false
;
if
(
!
node
->
IsFunction
())
return
false
;
const
auto
*
func
=
static_cast
<
const
Function
*>
(
node
);
const
auto
*
func
=
static_cast
<
const
Function
*>
(
node
);
...
@@ -102,6 +102,19 @@ class DfgPassManagerImpl final : public DfgPassManager {
...
@@ -102,6 +102,19 @@ class DfgPassManagerImpl final : public DfgPassManager {
Analyzer
::
Analyzer
()
{
Register
(
"manager1"
,
new
DfgPassManagerImpl
);
}
Analyzer
::
Analyzer
()
{
Register
(
"manager1"
,
new
DfgPassManagerImpl
);
}
void
Analyzer
::
Run
(
Argument
*
argument
)
{
void
Analyzer
::
Run
(
Argument
*
argument
)
{
// Ugly support for fluid-to-ir-pass
argument
->
Set
(
kFluidToIrPassesAttr
,
new
std
::
vector
<
std
::
string
>
({
// Manual update the passes here.
"graph_viz_pass"
,
//
"infer_clean_graph_pass"
,
"graph_viz_pass"
,
//
"attention_lstm_fuse_pass"
,
"graph_viz_pass"
,
//
"fc_lstm_fuse_pass"
,
"graph_viz_pass"
,
//
"seq_concat_fc_fuse_pass"
,
"graph_viz_pass"
,
//
"fc_fuse_pass"
,
"graph_viz_pass"
//
}));
for
(
auto
&
x
:
data_
)
{
for
(
auto
&
x
:
data_
)
{
PADDLE_ENFORCE
(
x
->
Initialize
(
argument
));
PADDLE_ENFORCE
(
x
->
Initialize
(
argument
));
x
->
RunAll
();
x
->
RunAll
();
...
...
paddle/fluid/inference/analysis/analyzer_tester.cc
浏览文件 @
b98b7440
...
@@ -20,6 +20,7 @@
...
@@ -20,6 +20,7 @@
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string
(
infer_ditu_rnn_model
,
""
,
"model path for ditu RNN"
);
DEFINE_string
(
infer_ditu_rnn_model
,
""
,
"model path for ditu RNN"
);
DEFINE_string
(
infer_ditu_rnn_data
,
""
,
"data path for ditu RNN"
);
DEFINE_string
(
infer_ditu_rnn_data
,
""
,
"data path for ditu RNN"
);
...
@@ -264,39 +265,24 @@ void TestDituRNNPrediction(const std::string &model_path,
...
@@ -264,39 +265,24 @@ void TestDituRNNPrediction(const std::string &model_path,
const
std
::
string
&
data_path
,
int
batch_size
,
const
std
::
string
&
data_path
,
int
batch_size
,
bool
use_analysis
,
bool
activate_ir
,
bool
use_analysis
,
bool
activate_ir
,
int
num_times
=
1
)
{
int
num_times
=
1
)
{
FLAGS_IA_enable_ir
=
activate_ir
;
FLAGS_IA_enable_tensorrt_subgraph_engine
=
false
;
FLAGS_IA_output_storage_path
=
"./analysis.out"
;
std
::
string
model_out
;
if
(
use_analysis
)
{
Argument
argument
(
model_path
);
argument
.
model_output_store_path
.
reset
(
new
std
::
string
(
"./analysis.out"
));
Analyzer
analyzer
;
analyzer
.
Run
(
&
argument
);
// Should get the transformed model stored to ./analysis.out
model_out
=
"./analysis.out"
;
ASSERT_TRUE
(
PathExists
(
model_out
));
}
else
{
model_out
=
FLAGS_infer_ditu_rnn_model
;
}
NativeConfig
config
;
NativeConfig
config
;
config
.
prog_file
=
model_out
+
"/__model__"
;
config
.
prog_file
=
FLAGS_infer_ditu_rnn_model
+
"/__model__"
;
config
.
param_file
=
model_out
+
"/param"
;
config
.
param_file
=
FLAGS_infer_ditu_rnn_model
+
"/param"
;
config
.
use_gpu
=
false
;
config
.
use_gpu
=
false
;
config
.
device
=
0
;
config
.
device
=
0
;
config
.
specify_input_name
=
true
;
config
.
specify_input_name
=
true
;
auto
predictor
=
auto
base_
predictor
=
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config
);
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config
);
auto
predictor
=
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kAnalysis
>
(
config
);
std
::
vector
<
PaddleTensor
>
input_slots
;
std
::
vector
<
PaddleTensor
>
input_slots
;
DataRecord
data
(
data_path
,
batch_size
);
DataRecord
data
(
data_path
,
batch_size
);
// Prepare inputs.
// Prepare inputs.
PrepareInputs
(
&
input_slots
,
&
data
,
batch_size
);
PrepareInputs
(
&
input_slots
,
&
data
,
batch_size
);
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
PaddleTensor
>
outputs
,
base_outputs
;
base_predictor
->
Run
(
input_slots
,
&
base_outputs
);
Timer
timer
;
Timer
timer
;
timer
.
tic
();
timer
.
tic
();
...
@@ -308,37 +294,25 @@ void TestDituRNNPrediction(const std::string &model_path,
...
@@ -308,37 +294,25 @@ void TestDituRNNPrediction(const std::string &model_path,
<<
", latency: "
<<
timer
.
toc
()
/
num_times
<<
"ms"
;
<<
", latency: "
<<
timer
.
toc
()
/
num_times
<<
"ms"
;
LOG
(
INFO
)
<<
"====================================="
;
LOG
(
INFO
)
<<
"====================================="
;
for
(
auto
&
out
:
outputs
)
{
PADDLE_ENFORCE_GT
(
outputs
.
size
(),
0
);
PADDLE_ENFORCE_EQ
(
outputs
.
size
(),
base_outputs
.
size
());
for
(
size_t
i
=
0
;
i
<
outputs
.
size
();
i
++
)
{
auto
&
out
=
outputs
[
i
];
auto
&
base_out
=
base_outputs
[
i
];
size_t
size
=
std
::
accumulate
(
out
.
shape
.
begin
(),
out
.
shape
.
end
(),
1
,
size_t
size
=
std
::
accumulate
(
out
.
shape
.
begin
(),
out
.
shape
.
end
(),
1
,
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
size_t
size1
=
std
::
accumulate
(
base_out
.
shape
.
begin
(),
base_out
.
shape
.
end
(),
1
,
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
PADDLE_ENFORCE_EQ
(
size
,
size1
);
PADDLE_ENFORCE_GT
(
size
,
0
);
float
*
data
=
static_cast
<
float
*>
(
out
.
data
.
data
());
float
*
data
=
static_cast
<
float
*>
(
out
.
data
.
data
());
for
(
size_t
i
=
0
;
float
*
base_data
=
static_cast
<
float
*>
(
base_out
.
data
.
data
());
i
<
std
::
min
(
sizeof
(
ditu_rnn_target_data
)
/
sizeof
(
float
),
size
);
for
(
size_t
i
=
0
;
i
<
size
;
i
++
)
{
i
++
)
{
EXPECT_NEAR
(
data
[
i
],
base_data
[
i
],
1e-3
);
EXPECT_NEAR
(
data
[
i
],
ditu_rnn_target_data
[
i
],
1e-3
);
}
}
}
}
}
}
// Turn on the IR pass support, run a real inference and check the result.
TEST
(
Analyzer
,
SupportIRPass
)
{
FLAGS_IA_enable_ir
=
true
;
FLAGS_IA_enable_tensorrt_subgraph_engine
=
false
;
FLAGS_IA_output_storage_path
=
"./analysis.out"
;
Argument
argument
(
FLAGS_inference_model_dir
);
argument
.
model_output_store_path
.
reset
(
new
std
::
string
(
"./analysis.out"
));
Analyzer
analyzer
;
analyzer
.
Run
(
&
argument
);
// Should get the transformed model stored to ./analysis.out
ASSERT_TRUE
(
PathExists
(
"./analysis.out"
));
// Inference from this path.
TestWord2vecPrediction
(
"./analysis.out"
);
}
// Directly infer with the original model.
// Directly infer with the original model.
TEST
(
Analyzer
,
DituRNN_without_analysis
)
{
TEST
(
Analyzer
,
DituRNN_without_analysis
)
{
TestDituRNNPrediction
(
FLAGS_infer_ditu_rnn_model
,
FLAGS_infer_ditu_rnn_data
,
TestDituRNNPrediction
(
FLAGS_infer_ditu_rnn_model
,
FLAGS_infer_ditu_rnn_data
,
...
@@ -365,5 +339,8 @@ TEST(Analyzer, DituRNN_with_analysis_with_IR) {
...
@@ -365,5 +339,8 @@ TEST(Analyzer, DituRNN_with_analysis_with_IR) {
}
// namespace paddle
}
// namespace paddle
USE_PASS
(
fc_fuse_pass
);
USE_PASS
(
fc_fuse_pass
);
USE_PASS
(
seq_concat_fc_fuse_pass
);
USE_PASS
(
fc_lstm_fuse_pass
);
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
infer_clean_graph_pass
);
USE_PASS
(
infer_clean_graph_pass
);
USE_PASS
(
attention_lstm_fuse_pass
);
paddle/fluid/inference/analysis/argument.h
浏览文件 @
b98b7440
...
@@ -26,6 +26,7 @@
...
@@ -26,6 +26,7 @@
#include <string>
#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/platform/variant.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
...
@@ -58,6 +59,46 @@ struct Argument {
...
@@ -58,6 +59,46 @@ struct Argument {
// The output storage path of ModelStorePass.
// The output storage path of ModelStorePass.
std
::
unique_ptr
<
std
::
string
>
model_output_store_path
;
std
::
unique_ptr
<
std
::
string
>
model_output_store_path
;
// Support for any other attributes.
// Registers `data` under `key` and takes ownership of it.
//
// The raw pointer is stored type-erased in `attrs_`, and a matching deleter is
// stored in `attr_deleters_` so the object can later be destroyed with its
// real static type (see the destructor, which invokes every deleter).
//
// Fails via PADDLE_ENFORCE when `data` is null or when `key` is already
// registered.
template <typename T>
void Set(const std::string &key, T *data) {
  PADDLE_ENFORCE_NOT_NULL(data);
  PADDLE_ENFORCE(!attrs_.count(key), "duplicate attr called %s", key);
  attrs_[key] = data;
  // Capture only what the deleter needs: the typed pointer and the key used
  // for logging. `this` is deliberately not captured because it is unused.
  attr_deleters_[key] = [data, key]() {
    VLOG(3) << "argument delete attr: " << key;
    delete data;
  };
}
// Returns true when an attribute called `name` is currently registered.
bool Has(const std::string &name) const {
  return attrs_.find(name) != attrs_.end();
}
// Hands the attribute stored under `key` back to the caller.
//
// Both the stored pointer and its deleter are removed, so the destructor will
// no longer delete the object; the caller is responsible for it from now on.
// Fails via PADDLE_ENFORCE when `key` is not registered. `T` must match the
// type the attribute was stored with, since the lookup uses boost::any_cast.
template <typename T>
T *Release(const std::string &key) {
  PADDLE_ENFORCE(attrs_.count(key));
  T *released = boost::any_cast<T *>(attrs_.at(key));
  attr_deleters_.erase(key);
  attrs_.erase(key);
  return released;
}
// Returns a reference to the attribute stored under `key`.
//
// Ownership is not transferred. Fails via PADDLE_ENFORCE when `key` is not
// registered. `T` must match the type the attribute was stored with, since
// the lookup uses boost::any_cast.
template <typename T>
T &Get(const std::string &key) {
  PADDLE_ENFORCE(Has(key));
  T *stored = boost::any_cast<T *>(attrs_.at(key));
  return *stored;
}
// Destroys every attribute that is still registered by running its deleter.
~Argument() {
  for (auto it = attr_deleters_.begin(); it != attr_deleters_.end(); ++it) {
    it->second();
  }
}
private:
std
::
unordered_map
<
std
::
string
,
boost
::
any
>
attrs_
;
std
::
unordered_map
<
std
::
string
,
std
::
function
<
void
()
>>
attr_deleters_
;
};
};
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
...
...
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
浏览文件 @
b98b7440
...
@@ -19,6 +19,7 @@
...
@@ -19,6 +19,7 @@
#include "paddle/fluid/framework/proto_desc.h"
#include "paddle/fluid/framework/proto_desc.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/io.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
...
@@ -65,6 +66,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
...
@@ -65,6 +66,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
}
}
}
}
if
(
argument_
->
Has
(
"param_scope"
))
{
LOG
(
WARNING
)
<<
"parameter changes in the scope takes effect"
;
}
PADDLE_ENFORCE
(
argument_
->
transformed_program_desc
.
get
());
PADDLE_ENFORCE
(
argument_
->
transformed_program_desc
.
get
());
}
}
...
...
paddle/fluid/inference/analysis/dot.h
浏览文件 @
b98b7440
...
@@ -29,13 +29,13 @@ namespace paddle {
...
@@ -29,13 +29,13 @@ namespace paddle {
namespace
inference
{
namespace
inference
{
namespace
analysis
{
namespace
analysis
{
static
size_t
dot_node_counter
{
0
};
/*
/*
* A Dot template that helps to build a DOT graph definition.
* A Dot template that helps to build a DOT graph definition.
*/
*/
class
Dot
{
class
Dot
{
public:
public:
static
size_t
counter
;
struct
Attr
{
struct
Attr
{
std
::
string
key
;
std
::
string
key
;
std
::
string
value
;
std
::
string
value
;
...
@@ -57,7 +57,7 @@ class Dot {
...
@@ -57,7 +57,7 @@ class Dot {
Node
(
const
std
::
string
&
name
,
const
std
::
vector
<
Attr
>&
attrs
)
Node
(
const
std
::
string
&
name
,
const
std
::
vector
<
Attr
>&
attrs
)
:
name
(
name
),
:
name
(
name
),
attrs
(
attrs
),
attrs
(
attrs
),
id_
(
"node_"
+
std
::
to_string
(
Dot
::
counter
++
))
{}
id_
(
"node_"
+
std
::
to_string
(
dot_node_
counter
++
))
{}
std
::
string
id
()
const
{
return
id_
;
}
std
::
string
id
()
const
{
return
id_
;
}
...
@@ -65,6 +65,10 @@ class Dot {
...
@@ -65,6 +65,10 @@ class Dot {
std
::
stringstream
ss
;
std
::
stringstream
ss
;
CHECK
(
!
name
.
empty
());
CHECK
(
!
name
.
empty
());
ss
<<
id_
;
ss
<<
id_
;
if
(
attrs
.
empty
())
{
ss
<<
"[label="
<<
'"'
<<
name
<<
'"'
<<
"]"
;
return
ss
.
str
();
}
for
(
size_t
i
=
0
;
i
<
attrs
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
attrs
.
size
();
i
++
)
{
if
(
i
==
0
)
{
if
(
i
==
0
)
{
ss
<<
"[label="
<<
'"'
<<
name
<<
'"'
<<
" "
;
ss
<<
"[label="
<<
'"'
<<
name
<<
'"'
<<
" "
;
...
@@ -108,9 +112,11 @@ class Dot {
...
@@ -108,9 +112,11 @@ class Dot {
explicit
Dot
(
const
std
::
vector
<
Attr
>&
attrs
)
:
attrs_
(
attrs
)
{}
explicit
Dot
(
const
std
::
vector
<
Attr
>&
attrs
)
:
attrs_
(
attrs
)
{}
void
AddNode
(
const
std
::
string
&
name
,
const
std
::
vector
<
Attr
>&
attrs
)
{
void
AddNode
(
const
std
::
string
&
id
,
const
std
::
vector
<
Attr
>&
attrs
,
CHECK
(
!
nodes_
.
count
(
name
))
<<
"duplicate Node '"
<<
name
<<
"'"
;
std
::
string
label
=
""
)
{
nodes_
.
emplace
(
name
,
Node
{
name
,
attrs
});
CHECK
(
!
nodes_
.
count
(
id
))
<<
"duplicate Node '"
<<
id
<<
"'"
;
if
(
label
.
empty
())
label
=
id
;
nodes_
.
emplace
(
id
,
Node
{
label
,
attrs
});
}
}
void
AddEdge
(
const
std
::
string
&
source
,
const
std
::
string
&
target
,
void
AddEdge
(
const
std
::
string
&
source
,
const
std
::
string
&
target
,
...
...
paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
浏览文件 @
b98b7440
...
@@ -13,3 +13,47 @@
...
@@ -13,3 +13,47 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
void
FluidToIrPass
::
EnableParamModify
(
const
std
::
string
&
model_dir
,
const
std
::
string
&
prog_file
,
const
std
::
string
&
param_file
)
{
PADDLE_ENFORCE
(
argument_
);
argument_
->
Set
(
"param_scope"
,
new
framework
::
Scope
);
// Load parameters.
VLOG
(
3
)
<<
"Loading parameters from "
<<
model_dir
;
LoadParams
(
&
argument_
->
Get
<
framework
::
Scope
>
(
"param_scope"
),
model_dir
,
prog_file
,
param_file
);
}
// Loads model parameters into `scope` using a CPU executor.
//
// scope:      destination scope for the loaded variables.
// dir:        model directory, used when the file pair below is not complete.
// prog_file:  program file; only used together with `param_file`.
// param_file: parameter file; only used together with `prog_file`.
//
// Returns true when one of the two load paths was taken, and false (after
// logging an error) when neither a complete file pair nor a directory was
// supplied. Requires `argument_->origin_program_desc` to be set.
bool FluidToIrPass::LoadParams(framework::Scope *scope, const std::string &dir,
                               const std::string &prog_file,
                               const std::string &param_file) {
  platform::CPUPlace place;
  framework::Executor executor(place);
  PADDLE_ENFORCE(argument_->origin_program_desc.get());

  // A complete (program, params) file pair takes precedence over the directory.
  if (!prog_file.empty() && !param_file.empty()) {
    LOG(INFO) << "load single model file from " << prog_file;
    Load(&executor, scope, prog_file, param_file);
    return true;
  }
  if (!dir.empty()) {
    LOG(INFO) << "load from dir " << dir;
    Load(&executor, scope, dir);
    return true;
  }
  LOG(ERROR) << "failed to load parameters";
  return false;
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/fluid_to_ir_pass.h
浏览文件 @
b98b7440
...
@@ -21,12 +21,17 @@ namespace paddle {
...
@@ -21,12 +21,17 @@ namespace paddle {
namespace
inference
{
namespace
inference
{
namespace
analysis
{
namespace
analysis
{
static
const
char
kFluidToIrPassesAttr
[]
=
"__fluid_to_ir_passes__"
;
class
FluidToIrPass
final
:
public
DataFlowGraphPass
{
class
FluidToIrPass
final
:
public
DataFlowGraphPass
{
public:
public:
FluidToIrPass
()
=
default
;
FluidToIrPass
()
=
default
;
bool
Initialize
(
Argument
*
argument
)
override
{
bool
Initialize
(
Argument
*
argument
)
override
{
ANALYSIS_ARGUMENT_CHECK_FIELD
(
argument
);
ANALYSIS_ARGUMENT_CHECK_FIELD
(
argument
);
PADDLE_ENFORCE
(
argument
->
Has
(
kFluidToIrPassesAttr
),
"argument need the attr %s"
,
kFluidToIrPassesAttr
);
argument_
=
argument
;
if
(
argument
->
origin_program_desc
)
{
if
(
argument
->
origin_program_desc
)
{
LOG
(
WARNING
)
<<
"argument's origin_program_desc is already set, might "
LOG
(
WARNING
)
<<
"argument's origin_program_desc is already set, might "
"duplicate called"
;
"duplicate called"
;
...
@@ -46,12 +51,21 @@ class FluidToIrPass final : public DataFlowGraphPass {
...
@@ -46,12 +51,21 @@ class FluidToIrPass final : public DataFlowGraphPass {
if
(
!
argument
->
main_dfg
)
{
if
(
!
argument
->
main_dfg
)
{
argument
->
main_dfg
.
reset
(
new
DataFlowGraph
);
argument
->
main_dfg
.
reset
(
new
DataFlowGraph
);
}
}
// Persist the ProgramDesc in graph's attribute. The IR graph just keep the
argument
->
Set
(
"ir_program_desc"
,
new
framework
::
ProgramDesc
(
program
));
// address, will segfault if the original ProgramDesc destroys.
auto
&
ir_program_p
=
argument
->
main_dfg
->
Attr
(
"ir_program_desc"
).
Pointer
();
LOG
(
INFO
)
<<
"Loading parameters"
;
ir_program_p
=
new
framework
::
ProgramDesc
(
program
);
// Load parameters to argument if needed.
if
(
argument
->
fluid_model_dir
||
(
argument
->
fluid_model_program_path
&&
argument
->
fluid_model_param_path
))
{
#define SAFE_GET(ATTR) std::string ATTR = argument->ATTR ? *argument->ATTR : "";
SAFE_GET
(
fluid_model_dir
);
SAFE_GET
(
fluid_model_program_path
);
SAFE_GET
(
fluid_model_param_path
);
#undef SAFE_GET
EnableParamModify
(
fluid_model_dir
,
fluid_model_program_path
,
fluid_model_param_path
);
}
argument_
=
argument
;
return
true
;
return
true
;
}
}
...
@@ -59,20 +73,36 @@ class FluidToIrPass final : public DataFlowGraphPass {
...
@@ -59,20 +73,36 @@ class FluidToIrPass final : public DataFlowGraphPass {
void
Run
(
DataFlowGraph
*
graph
)
override
{
void
Run
(
DataFlowGraph
*
graph
)
override
{
// Call all the IR Passes
// Call all the IR Passes
IRPassManager
ir_passes
(
*
static_cast
<
framework
::
ProgramDesc
*>
(
IRPassManager
ir_passes
(
argument_
->
main_dfg
->
Attr
(
"ir_program_desc"
).
Pointer
()));
argument_
->
Get
<
framework
::
ProgramDesc
>
(
"ir_program_desc"
),
nullptr
);
ir_passes
.
Apply
(
std
::
vector
<
std
::
string
>
(
// Pass the scope from analysis to IR if needed.
{
// Manual update the passes here.
if
(
argument_
->
Has
(
"param_scope"
))
{
"graph_viz_pass"
,
"infer_clean_graph_pass"
,
"graph_viz_pass"
,
// Here the address is passed, attention that IR doesn't own the scope, so
"fc_fuse_pass"
,
"graph_viz_pass"
}));
// the real scope in analysis should live during the IR phase.
ir_passes
.
graph
().
Set
(
"param_scope"
,
new
framework
::
Scope
*
(
&
argument_
->
Get
<
framework
::
Scope
>
(
"param_scope"
)));
}
const
auto
&
ir_passes_to_apply
=
argument_
->
Get
<
std
::
vector
<
std
::
string
>>
(
kFluidToIrPassesAttr
);
ir_passes
.
Apply
(
ir_passes_to_apply
);
PADDLE_ENFORCE
(
argument_
->
main_dfg
.
get
());
PADDLE_ENFORCE
(
argument_
->
main_dfg
.
get
());
argument_
->
main_dfg
->
Build
(
ir_passes
.
graph
());
argument_
->
main_dfg
->
Build
(
ir_passes
.
graph
());
// PADDLE_ENFORCE(argument_->main_dfg->IsFullyConnected());
}
}
void
EnableParamModify
(
const
std
::
string
&
model_dir
,
const
std
::
string
&
prog_file
,
const
std
::
string
&
param_file
);
std
::
string
repr
()
const
override
{
return
"fluid-to-ir-pass"
;
}
std
::
string
repr
()
const
override
{
return
"fluid-to-ir-pass"
;
}
private:
// Load parameters from a single file or from a directory.
bool
LoadParams
(
framework
::
Scope
*
scope
,
const
std
::
string
&
dir
,
const
std
::
string
&
prog_file
,
const
std
::
string
&
param_file
);
private:
private:
Argument
*
argument_
{
nullptr
};
Argument
*
argument_
{
nullptr
};
};
};
...
...
paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
浏览文件 @
b98b7440
...
@@ -24,6 +24,8 @@ namespace analysis {
...
@@ -24,6 +24,8 @@ namespace analysis {
TEST
(
FluidToIrPass
,
Test
)
{
TEST
(
FluidToIrPass
,
Test
)
{
FluidToIrPass
pass
;
FluidToIrPass
pass
;
Argument
argument
(
FLAGS_inference_model_dir
);
Argument
argument
(
FLAGS_inference_model_dir
);
argument
.
Set
(
kFluidToIrPassesAttr
,
new
std
::
vector
<
std
::
string
>
({
"infer_clean_graph_pass"
}));
pass
.
Initialize
(
&
argument
);
pass
.
Initialize
(
&
argument
);
pass
.
Run
(
argument
.
main_dfg
.
get
());
pass
.
Run
(
argument
.
main_dfg
.
get
());
}
}
...
@@ -32,6 +34,9 @@ TEST(FluidToIrPass, Test) {
...
@@ -32,6 +34,9 @@ TEST(FluidToIrPass, Test) {
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
USE_PASS
(
fc_fuse_pass
);
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
infer_clean_graph_pass
);
USE_PASS
(
infer_clean_graph_pass
);
USE_PASS
(
attention_lstm_fuse_pass
);
USE_PASS
(
fc_lstm_fuse_pass
);
USE_PASS
(
seq_concat_fc_fuse_pass
);
USE_PASS
(
fc_fuse_pass
);
paddle/fluid/inference/analysis/ir_pass_manager.cc
浏览文件 @
b98b7440
...
@@ -14,20 +14,24 @@
...
@@ -14,20 +14,24 @@
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include <string>
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/scope.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
namespace
analysis
{
namespace
analysis
{
IRPassManager
::
IRPassManager
(
const
ProgramDesc
&
program
)
{
IRPassManager
::
IRPassManager
(
const
ProgramDesc
&
program
,
framework
::
Scope
*
scope
)
:
program_
(
program
)
{
graph_
.
reset
(
new
framework
::
ir
::
Graph
(
program
));
graph_
.
reset
(
new
framework
::
ir
::
Graph
(
program
));
if
(
scope
)
graph_
->
Set
(
"param_scope"
,
new
framework
::
Scope
*
(
scope
));
}
}
void
IRPassManager
::
Apply
(
const
std
::
vector
<
std
::
string
>&
passes
)
{
void
IRPassManager
::
Apply
(
const
std
::
vector
<
std
::
string
>
&
passes
)
{
graph_
->
Set
(
"graph_viz_path"
,
new
std
::
string
(
"./1.dot"
));
// Apply all the passes
// Apply all the passes
std
::
string
pre_pass
;
std
::
string
pre_pass
;
for
(
const
std
::
string
&
pass_name
:
passes
)
{
for
(
const
std
::
string
&
pass_name
:
passes
)
{
LOG
(
WARNING
)
<<
"Running IR pass ["
<<
pass_name
<<
"]"
;
LOG
(
WARNING
)
<<
"Running IR pass ["
<<
pass_name
<<
"]"
;
auto
pass
=
framework
::
ir
::
PassRegistry
::
Instance
().
Get
(
pass_name
);
auto
pass
=
framework
::
ir
::
PassRegistry
::
Instance
().
Get
(
pass_name
);
if
(
pass_name
==
"graph_viz_pass"
)
{
if
(
pass_name
==
"graph_viz_pass"
)
{
...
...
paddle/fluid/inference/analysis/ir_pass_manager.h
浏览文件 @
b98b7440
...
@@ -23,6 +23,7 @@
...
@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
...
@@ -31,14 +32,15 @@ using framework::ProgramDesc;
...
@@ -31,14 +32,15 @@ using framework::ProgramDesc;
class
IRPassManager
final
{
class
IRPassManager
final
{
public:
public:
IRPassManager
(
const
ProgramDesc
&
program
);
IRPassManager
(
const
ProgramDesc
&
program
,
framework
::
Scope
*
scope
);
void
Apply
(
const
std
::
vector
<
std
::
string
>
&
passes
);
void
Apply
(
const
std
::
vector
<
std
::
string
>
&
passes
);
framework
::
ir
::
Graph
&
graph
()
const
{
return
*
graph_
;
}
framework
::
ir
::
Graph
&
graph
()
const
{
return
*
graph_
;
}
private:
private:
std
::
unique_ptr
<
framework
::
ir
::
Graph
>
graph_
;
std
::
unique_ptr
<
framework
::
ir
::
Graph
>
graph_
;
ProgramDesc
program_
;
};
};
}
// namespace analysis
}
// namespace analysis
...
...
paddle/fluid/inference/analysis/pass_manager.cc
浏览文件 @
b98b7440
...
@@ -33,9 +33,9 @@ bool PassManager::Initialize(Argument* argument) {
...
@@ -33,9 +33,9 @@ bool PassManager::Initialize(Argument* argument) {
void
DfgPassManager
::
RunAll
()
{
void
DfgPassManager
::
RunAll
()
{
PADDLE_ENFORCE
(
argument_
);
PADDLE_ENFORCE
(
argument_
);
LOG
(
INFO
)
<<
"Total "
<<
data_
.
size
()
<<
" passes"
;
LOG
(
INFO
)
<<
"Total "
<<
data_
.
size
()
<<
"
Analysys
passes"
;
for
(
auto
&
pass
:
data_
)
{
for
(
auto
&
pass
:
data_
)
{
LOG
(
WARNING
)
<<
"Running pass ["
<<
pass
->
repr
()
<<
"]"
;
LOG
(
WARNING
)
<<
"Running
Analysis
pass ["
<<
pass
->
repr
()
<<
"]"
;
pass
->
Run
(
argument_
->
main_dfg
.
get
());
pass
->
Run
(
argument_
->
main_dfg
.
get
());
}
}
}
}
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
b98b7440
...
@@ -46,7 +46,8 @@ function(inference_api_test TARGET_NAME)
...
@@ -46,7 +46,8 @@ function(inference_api_test TARGET_NAME)
endif
(
WITH_TESTING
)
endif
(
WITH_TESTING
)
endfunction
(
inference_api_test
)
endfunction
(
inference_api_test
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api
)
cc_test
(
test_paddle_inference_api
cc_test
(
test_paddle_inference_api
SRCS api_tester.cc
SRCS api_tester.cc
...
...
paddle/fluid/inference/api/analysis_predictor.cc
0 → 100644
浏览文件 @
b98b7440
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace
paddle
{
using
inference
::
analysis
::
Argument
;
using
inference
::
Singleton
;
using
inference
::
analysis
::
Analyzer
;
using
framework
::
proto
::
ProgramDesc
;
/* This predictor is based on the original native predictor with IR and Analysis
* support. It will optimize IR and Parameters in the runtime.
* TODO(Superjomn) Replace the Native predictor?
*/
class
AnalysisPredictor
:
public
NativePaddlePredictor
{
public:
explicit
AnalysisPredictor
(
const
NativeConfig
&
config
)
:
NativePaddlePredictor
(
config
),
config_
(
config
)
{}
bool
Init
(
const
std
::
shared_ptr
<
framework
::
Scope
>&
parent_scope
)
{
VLOG
(
3
)
<<
"Predictor::init()"
;
if
(
config_
.
use_gpu
)
{
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
}
else
{
place_
=
paddle
::
platform
::
CPUPlace
();
}
PADDLE_ENFORCE
(
!
parent_scope
);
if
(
parent_scope
)
{
scope_
=
parent_scope
;
sub_scope_
=
&
(
parent_scope
->
NewScope
());
}
else
{
paddle
::
framework
::
InitDevices
(
false
);
scope_
.
reset
(
new
paddle
::
framework
::
Scope
());
}
executor_
.
reset
(
new
paddle
::
framework
::
Executor
(
place_
));
// Initialize the inference program
if
(
!
config_
.
model_dir
.
empty
())
{
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_
=
paddle
::
inference
::
Load
(
executor_
.
get
(),
scope_
.
get
(),
config_
.
model_dir
);
}
else
if
(
!
config_
.
prog_file
.
empty
()
&&
!
config_
.
param_file
.
empty
())
{
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_
=
paddle
::
inference
::
Load
(
executor_
.
get
(),
scope_
.
get
(),
config_
.
prog_file
,
config_
.
param_file
);
}
else
{
LOG
(
ERROR
)
<<
"fail to load inference model."
;
return
false
;
}
OptimizeInferenceProgram
();
ctx_
=
executor_
->
Prepare
(
*
inference_program_
,
0
);
VLOG
(
5
)
<<
"to create variables"
;
PADDLE_ENFORCE
(
scope_
.
get
());
executor_
->
CreateVariables
(
*
inference_program_
,
sub_scope_
?
sub_scope_
:
scope_
.
get
(),
0
);
// Get the feed_target_names and fetch_target_names
feed_target_names_
=
inference_program_
->
GetFeedTargetNames
();
fetch_target_names_
=
inference_program_
->
GetFetchTargetNames
();
return
true
;
}
bool
Run
(
const
std
::
vector
<
PaddleTensor
>&
inputs
,
std
::
vector
<
PaddleTensor
>*
output_data
,
int
batch_size
=
-
1
)
override
{
return
NativePaddlePredictor
::
Run
(
inputs
,
output_data
,
batch_size
);
}
void
OptimizeInferenceProgram
()
{
LOG
(
INFO
)
<<
"optimize begin"
;
FLAGS_IA_enable_ir
=
true
;
FLAGS_IA_enable_tensorrt_subgraph_engine
=
false
;
FLAGS_IA_output_storage_path
=
""
;
// Don't output the model.
// Analyze inference_program
Argument
argument
;
if
(
!
config_
.
model_dir
.
empty
())
{
argument
.
fluid_model_dir
.
reset
(
new
std
::
string
(
config_
.
model_dir
));
}
else
{
PADDLE_ENFORCE
(
!
config_
.
param_file
.
empty
(),
"Either model_dir or (param_file, prog_file) should be set."
);
PADDLE_ENFORCE
(
!
config_
.
prog_file
.
empty
());
argument
.
fluid_model_program_path
.
reset
(
new
std
::
string
(
config_
.
prog_file
));
argument
.
fluid_model_param_path
.
reset
(
new
std
::
string
(
config_
.
param_file
));
}
argument
.
origin_program_desc
.
reset
(
new
ProgramDesc
(
*
inference_program_
->
Proto
()));
Singleton
<
Analyzer
>::
Global
().
Run
(
&
argument
);
CHECK
(
argument
.
transformed_program_desc
);
VLOG
(
5
)
<<
"to prepare executor"
;
// LOG(INFO) << "transformed_parogram_desc " <<
// argument.transformed_program_desc->DebugString();
inference_program_
.
reset
(
new
framework
::
ProgramDesc
(
*
argument
.
transformed_program_desc
));
PADDLE_ENFORCE
(
argument
.
Has
(
"param_scope"
));
// Update scope.
scope_
.
reset
(
argument
.
Release
<
framework
::
Scope
>
(
"param_scope"
));
LOG
(
INFO
)
<<
"optimize end =="
;
}
private:
NativeConfig
config_
;
};
// Factory for the analysis-enabled predictor.
//
// When `config.use_gpu` is set, validates the GPU settings and passes the
// requested memory fraction to gflags before the predictor is built.
// Returns nullptr when AnalysisPredictor::Init() reports failure.
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(
    const NativeConfig& config) {
  VLOG(3) << "create AnalysisPredictor";
  if (config.use_gpu) {
    // 1. GPU memory
    PADDLE_ENFORCE_GT(
        config.fraction_of_gpu_memory, 0.f,
        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);

    // The previous guard (`fraction >= 0.0f || fraction <= 0.95f`) was always
    // true, so the flag was set unconditionally; that behavior is kept here
    // without the misleading condition.
    std::vector<std::string> flags;
    // NOTE(review): presumably a placeholder for the program name (argv[0])
    // expected by the flag parser -- confirm against framework::InitGflags.
    flags.push_back("dummpy");
    std::string flag = "--fraction_of_gpu_memory_to_use=" +
                       std::to_string(config.fraction_of_gpu_memory);
    flags.push_back(flag);
    VLOG(3) << "set flag: " << flag;
    framework::InitGflags(flags);
  }

  // Keep the concrete type until Init() has run, so no downcast is needed.
  std::unique_ptr<AnalysisPredictor> predictor(new AnalysisPredictor(config));
  if (!predictor->Init(nullptr)) {
    return nullptr;
  }
  return std::unique_ptr<PaddlePredictor>(predictor.release());
}
}
// namespace paddle
USE_PASS
(
fc_fuse_pass
);
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
infer_clean_graph_pass
);
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
浏览文件 @
b98b7440
...
@@ -32,6 +32,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
...
@@ -32,6 +32,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
:
NativePaddlePredictor
(
config
),
config_
(
config
)
{}
:
NativePaddlePredictor
(
config
),
config_
(
config
)
{}
bool
Init
(
const
std
::
shared_ptr
<
framework
::
Scope
>&
parent_scope
)
{
bool
Init
(
const
std
::
shared_ptr
<
framework
::
Scope
>&
parent_scope
)
{
FLAGS_IA_enable_tensorrt_subgraph_engine
=
true
;
VLOG
(
3
)
<<
"Predictor::init()"
;
VLOG
(
3
)
<<
"Predictor::init()"
;
FLAGS_tensorrt_max_batch_size
=
config_
.
max_batch_size
;
FLAGS_tensorrt_max_batch_size
=
config_
.
max_batch_size
;
FLAGS_tensorrt_workspace_size
=
config_
.
workspace_size
;
FLAGS_tensorrt_workspace_size
=
config_
.
workspace_size
;
...
@@ -161,3 +162,4 @@ USE_TRT_CONVERTER(fc);
...
@@ -161,3 +162,4 @@ USE_TRT_CONVERTER(fc);
USE_TRT_CONVERTER
(
pool2d
);
USE_TRT_CONVERTER
(
pool2d
);
USE_TRT_CONVERTER
(
softmax
);
USE_TRT_CONVERTER
(
softmax
);
USE_TRT_CONVERTER
(
batch_norm
);
USE_TRT_CONVERTER
(
batch_norm
);
USE_TRT_CONVERTER
(
concat
);
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
浏览文件 @
b98b7440
...
@@ -37,6 +37,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
...
@@ -37,6 +37,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
config1
.
use_gpu
=
true
;
config1
.
use_gpu
=
true
;
config1
.
fraction_of_gpu_memory
=
0.3
;
config1
.
fraction_of_gpu_memory
=
0.3
;
config1
.
device
=
0
;
config1
.
device
=
0
;
config1
.
max_batch_size
=
10
;
auto
predictor0
=
auto
predictor0
=
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config0
);
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config0
);
...
...
paddle/fluid/inference/api/helper.cc
0 → 100644
浏览文件 @
b98b7440
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/helper.h"
namespace
paddle
{
namespace
inference
{
// Renders a 2-D float container: each inner vector is formatted with the
// generic to_string() and followed by a newline.
template <>
std::string to_string<std::vector<float>>(
    const std::vector<std::vector<float>> &vec) {
  std::stringstream out;
  for (size_t row = 0; row < vec.size(); ++row) {
    out << to_string(vec[row]) << "\n";
  }
  return out.str();
}
// Renders a 3-D float container: every innermost vector is formatted with the
// generic to_string() and followed by ";\t"; each outer element ends with a
// newline.
template <>
std::string to_string<std::vector<std::vector<float>>>(
    const std::vector<std::vector<std::vector<float>>> &vec) {
  std::stringstream out;
  for (auto line_it = vec.begin(); line_it != vec.end(); ++line_it) {
    for (auto rcd_it = line_it->begin(); rcd_it != line_it->end(); ++rcd_it) {
      out << to_string(*rcd_it) << ";\t";
    }
    out << '\n';
  }
  return out.str();
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/api/helper.h
浏览文件 @
b98b7440
...
@@ -44,7 +44,8 @@ class Timer {
...
@@ -44,7 +44,8 @@ class Timer {
}
}
};
};
void
split
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
std
::
string
>
*
pieces
)
{
static
void
split
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
std
::
string
>
*
pieces
)
{
pieces
->
clear
();
pieces
->
clear
();
if
(
str
.
empty
())
{
if
(
str
.
empty
())
{
return
;
return
;
...
@@ -60,7 +61,8 @@ void split(const std::string &str, char sep, std::vector<std::string> *pieces) {
...
@@ -60,7 +61,8 @@ void split(const std::string &str, char sep, std::vector<std::string> *pieces) {
pieces
->
push_back
(
str
.
substr
(
pos
));
pieces
->
push_back
(
str
.
substr
(
pos
));
}
}
}
}
void
split_to_float
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
float
>
*
fs
)
{
static
void
split_to_float
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
float
>
*
fs
)
{
std
::
vector
<
std
::
string
>
pieces
;
std
::
vector
<
std
::
string
>
pieces
;
split
(
str
,
sep
,
&
pieces
);
split
(
str
,
sep
,
&
pieces
);
std
::
transform
(
pieces
.
begin
(),
pieces
.
end
(),
std
::
back_inserter
(
*
fs
),
std
::
transform
(
pieces
.
begin
(),
pieces
.
end
(),
std
::
back_inserter
(
*
fs
),
...
@@ -76,27 +78,14 @@ std::string to_string(const std::vector<T> &vec) {
...
@@ -76,27 +78,14 @@ std::string to_string(const std::vector<T> &vec) {
}
}
template
<
>
template
<
>
std
::
string
to_string
<
std
::
vector
<
float
>>
(
std
::
string
to_string
<
std
::
vector
<
float
>>
(
const
std
::
vector
<
std
::
vector
<
float
>>
&
vec
)
{
const
std
::
vector
<
std
::
vector
<
float
>>
&
vec
);
std
::
stringstream
ss
;
for
(
const
auto
&
piece
:
vec
)
{
ss
<<
to_string
(
piece
)
<<
"
\n
"
;
}
return
ss
.
str
();
}
template
<
>
template
<
>
std
::
string
to_string
<
std
::
vector
<
std
::
vector
<
float
>>>
(
std
::
string
to_string
<
std
::
vector
<
std
::
vector
<
float
>>>
(
const
std
::
vector
<
std
::
vector
<
std
::
vector
<
float
>>>
&
vec
)
{
const
std
::
vector
<
std
::
vector
<
std
::
vector
<
float
>>>
&
vec
);
std
::
stringstream
ss
;
for
(
const
auto
&
line
:
vec
)
{
for
(
const
auto
&
rcd
:
line
)
{
ss
<<
to_string
(
rcd
)
<<
";
\t
"
;
}
ss
<<
'\n'
;
}
return
ss
.
str
();
}
// clang-format off
// clang-format off
void
TensorAssignData
(
PaddleTensor
*
tensor
,
const
std
::
vector
<
std
::
vector
<
float
>>
&
data
)
{
static
void
TensorAssignData
(
PaddleTensor
*
tensor
,
const
std
::
vector
<
std
::
vector
<
float
>>
&
data
)
{
// Assign buffer
// Assign buffer
int
dim
=
std
::
accumulate
(
tensor
->
shape
.
begin
(),
tensor
->
shape
.
end
(),
1
,
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
int
dim
=
std
::
accumulate
(
tensor
->
shape
.
begin
(),
tensor
->
shape
.
end
(),
1
,
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
tensor
->
data
.
Resize
(
sizeof
(
float
)
*
dim
);
tensor
->
data
.
Resize
(
sizeof
(
float
)
*
dim
);
...
...
paddle/fluid/inference/api/paddle_inference_api.h
浏览文件 @
b98b7440
...
@@ -77,6 +77,7 @@ enum class PaddleEngineKind {
...
@@ -77,6 +77,7 @@ enum class PaddleEngineKind {
kNative
=
0
,
// Use the native Fluid facility.
kNative
=
0
,
// Use the native Fluid facility.
kAnakin
,
// Use Anakin for inference.
kAnakin
,
// Use Anakin for inference.
kAutoMixedTensorRT
,
// Automatically mix Fluid with TensorRT.
kAutoMixedTensorRT
,
// Automatically mix Fluid with TensorRT.
kAnalysis
// TODO(Superjomn) support following engines later.
// TODO(Superjomn) support following engines later.
// kTensorRT, // Use TensorRT for inference.
// kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
...
...
paddle/fluid/inference/io.cc
浏览文件 @
b98b7440
...
@@ -143,5 +143,21 @@ std::unique_ptr<framework::ProgramDesc> Load(
...
@@ -143,5 +143,21 @@ std::unique_ptr<framework::ProgramDesc> Load(
return
main_program
;
return
main_program
;
}
}
// Saves the variables named in `vars` by building a one-op program
// ("save_combine") whose "file_path" attribute is `dirname + "/param"`, and
// running it against `scope` with a CPU executor.
// NOTE(review): `predicate` is not read anywhere in this function - confirm
// whether it is still needed.
void SaveVars(const framework::Scope& scope,
              const std::vector<std::string>& vars, const std::string& dirname,
              bool predicate) {
  framework::ProgramDesc save_program;
  auto* save_op = save_program.MutableBlock(0)->AppendOp();
  save_op->SetType("save_combine");
  save_op->SetInput("X", vars);
  save_op->SetAttr("file_path", dirname + "/param");
  save_op->CheckAttrs();

  platform::CPUPlace cpu_place;
  framework::Executor executor(cpu_place);
  // Run() is handed a non-const Scope*, hence the const_cast on `scope`.
  framework::Scope* mutable_scope = const_cast<framework::Scope*>(&scope);
  executor.Run(save_program, mutable_scope, 0, true, true);
}
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/io.h
浏览文件 @
b98b7440
...
@@ -41,5 +41,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
...
@@ -41,5 +41,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
const
std
::
string
&
prog_filename
,
const
std
::
string
&
prog_filename
,
const
std
::
string
&
param_filename
);
const
std
::
string
&
param_filename
);
// Save the variables from a scope to disk.
//   scope     - scope the save program is executed against.
//   vars      - names of the variables to save; the definition passes them as
//               input "X" of a "save_combine" op.
//   dirname   - output directory; the definition writes to dirname + "/param".
//   predicate - defaults to true.
//               NOTE(review): the definition in io.cc does not read it.
void SaveVars(const framework::Scope& scope,
              const std::vector<std::string>& vars, const std::string& dirname,
              bool predicate = true);
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
浏览文件 @
b98b7440
# Add TRT tests
# Add TRT tests
nv_library
(
tensorrt_converter
nv_library
(
tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc
concat_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry
)
DEPS tensorrt_engine operator scope framework_proto op_registry
)
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
...
@@ -18,12 +18,12 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
...
@@ -18,12 +18,12 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine conv_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine conv_op SERIAL
)
nv_test
(
test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
nv_test
(
test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine pool_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine pool_op SERIAL
)
nv_test
(
test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
nv_test
(
test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine elementwise_add_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine elementwise_add_op SERIAL
)
nv_test
(
test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
nv_test
(
test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine softmax_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine softmax_op SERIAL
)
nv_test
(
test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
nv_test
(
test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine batch_norm_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine batch_norm_op SERIAL
)
nv_test
(
test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine concat_op SERIAL
)
paddle/fluid/inference/tensorrt/convert/concat_op.cc
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
/*
 * ConcatOp, mapped to the TensorRT concatenation layer. This layer doesn't
 * have weights.
 */
class ConcatOpConverter : public OpConverter {
 public:
  // Converts one fluid concat op into a TensorRT concatenation layer.
  //   op        - proto description of the concat op; inputs are read from
  //               "X", the output name from "Out"[0], the axis from "axis".
  //   scope     - not read by this converter.
  //   test_mode - when true, the output tensor is also declared as an engine
  //               output.
  // Fails the PADDLE_ENFORCE check when the "axis" attribute is not > 0.
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    VLOG(4) << "convert a fluid concat op to tensorrt concatenation layer";

    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    std::vector<nvinfer1::ITensor*> itensors;
    for (auto& input_name : op_desc.Input("X")) {
      itensors.push_back(engine_->GetITensor(input_name));
    }
    int axis = boost::get<int>(op_desc.GetAttr("axis"));
    PADDLE_ENFORCE(axis > 0,
                   "The axis attr of Concat op should be large than 0 for trt");

    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(),
                                       itensors.size());
    axis = axis - 1;  // Remove batch dim
    layer->setAxis(axis);
    auto output_name = op_desc.Output("Out")[0];
    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {  // the test framework can not determine which is the
                      // output, so place the declaration inside.
      engine_->DeclareOutput(output_name);
    }
  }
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
REGISTER_TRT_OP_CONVERTER
(
concat
,
ConcatOpConverter
);
paddle/fluid/inference/tensorrt/convert/op_converter.h
浏览文件 @
b98b7440
...
@@ -79,6 +79,14 @@ class OpConverter {
...
@@ -79,6 +79,14 @@ class OpConverter {
it
=
it
=
Registry
<
OpConverter
>::
Lookup
(
"elementwise_"
+
op_type
+
"_tensor"
);
Registry
<
OpConverter
>::
Lookup
(
"elementwise_"
+
op_type
+
"_tensor"
);
}
}
PADDLE_ENFORCE_NOT_NULL
(
it
,
"no OpConverter for optype [%s]"
,
op_desc
.
Type
());
}
if
(
op_desc
.
Type
()
==
"depthwise_conv2d"
)
{
it
=
Registry
<
OpConverter
>::
Lookup
(
"conv2d"
);
PADDLE_ENFORCE_NOT_NULL
(
it
,
"no OpConverter for optype [%s]"
,
op_desc
.
Type
());
}
}
if
(
!
it
)
{
if
(
!
it
)
{
...
...
paddle/fluid/inference/tensorrt/convert/test_concat_op.cc
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// Runs the concat converter through TRTConvertValidation: three inputs with
// 10, 3 and 7 channels are concatenated along axis 1 into a 20-channel output.
TEST(concat_op, test) {
  std::unordered_set<std::string> param_names({""});
  framework::Scope test_scope;
  TRTConvertValidation trt_validator(10, param_names, test_scope, 1000);

  trt_validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1));
  trt_validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1));
  trt_validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1));
  trt_validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1));

  // Describe the concat op under test.
  framework::OpDesc concat_desc;
  concat_desc.SetType("concat");
  concat_desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
  concat_desc.SetOutput("Out", {"concat_out"});

  int concat_axis = 1;
  concat_desc.SetAttr("axis", concat_axis);

  trt_validator.SetOp(*concat_desc.Proto());

  trt_validator.Execute(5);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
concat
);
paddle/fluid/inference/tests/test_helper.h
浏览文件 @
b98b7440
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
...
@@ -135,6 +136,15 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
...
@@ -135,6 +136,15 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
return
feed_target_shapes
;
return
feed_target_shapes
;
}
}
// Builds an ir::Graph from `program` and applies the registered
// "graph_to_program_pass" to it; the pass receives `program` as a not-owned
// attribute named "program".
void Compile(paddle::framework::ProgramDesc* program) {
  namespace ir = paddle::framework::ir;

  std::unique_ptr<ir::Graph> graph(new ir::Graph(*program));
  auto to_program_pass =
      ir::PassRegistry::Instance().Get("graph_to_program_pass");
  to_program_pass->SetNotOwned<paddle::framework::ProgramDesc>("program",
                                                               program);
  to_program_pass->Apply(std::move(graph));
}
template
<
typename
Place
,
bool
CreateVars
=
true
,
bool
PrepareContext
=
false
>
template
<
typename
Place
,
bool
CreateVars
=
true
,
bool
PrepareContext
=
false
>
void
TestInference
(
const
std
::
string
&
dirname
,
void
TestInference
(
const
std
::
string
&
dirname
,
const
std
::
vector
<
paddle
::
framework
::
LoDTensor
*>&
cpu_feeds
,
const
std
::
vector
<
paddle
::
framework
::
LoDTensor
*>&
cpu_feeds
,
...
@@ -172,6 +182,8 @@ void TestInference(const std::string& dirname,
...
@@ -172,6 +182,8 @@ void TestInference(const std::string& dirname,
paddle
::
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
paddle
::
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
inference_program
=
InitProgram
(
&
executor
,
scope
,
dirname
,
is_combined
);
inference_program
=
InitProgram
(
&
executor
,
scope
,
dirname
,
is_combined
);
}
}
Compile
(
inference_program
.
get
());
// Disable the profiler and print the timing information
// Disable the profiler and print the timing information
paddle
::
platform
::
DisableProfiler
(
paddle
::
platform
::
EventSortingKey
::
kDefault
,
paddle
::
platform
::
DisableProfiler
(
paddle
::
platform
::
EventSortingKey
::
kDefault
,
"load_program_profiler"
);
"load_program_profiler"
);
...
@@ -249,3 +261,5 @@ void TestInference(const std::string& dirname,
...
@@ -249,3 +261,5 @@ void TestInference(const std::string& dirname,
delete
scope
;
delete
scope
;
}
}
USE_PASS
(
graph_to_program_pass
);
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
b98b7440
...
@@ -291,6 +291,8 @@ op_library(unsqueeze_op DEPS reshape_op)
...
@@ -291,6 +291,8 @@ op_library(unsqueeze_op DEPS reshape_op)
op_library
(
squeeze_op DEPS reshape_op
)
op_library
(
squeeze_op DEPS reshape_op
)
op_library
(
extract_rows_op DEPS memory
)
op_library
(
extract_rows_op DEPS memory
)
op_library
(
flatten_op DEPS reshape_op
)
op_library
(
flatten_op DEPS reshape_op
)
op_library
(
sequence_pad_op DEPS sequence_padding
)
op_library
(
unstack_op DEPS stack_op
)
if
(
WITH_GPU
)
if
(
WITH_GPU
)
op_library
(
conv_op DEPS vol2col depthwise_conv im2col
)
op_library
(
conv_op DEPS vol2col depthwise_conv im2col
)
...
...
paddle/fluid/operators/attention_lstm_op.cc
浏览文件 @
b98b7440
...
@@ -56,7 +56,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -56,7 +56,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
const
int
D
=
w_dims
[
1
]
/
4
;
const
int
D
=
w_dims
[
1
]
/
4
;
PADDLE_ENFORCE_EQ
(
w_dims
.
size
(),
2
,
"Input(LSTMWeight)'s rank must be 2."
);
PADDLE_ENFORCE_EQ
(
w_dims
.
size
(),
2
,
"Input(LSTMWeight)'s rank must be 2."
);
PADDLE_ENFORCE_EQ
(
w_dims
[
0
],
D
+
M
,
PADDLE_ENFORCE_EQ
(
w_dims
[
0
],
D
+
M
,
"LSTMWeight dims should be (%d + %d) * %d."
,
D
+
M
,
4
*
D
);
"LSTMWeight dims should be (%d + %d) * %d."
,
D
,
M
,
4
*
D
);
auto
b_dims
=
ctx
->
GetInputDim
(
"LSTMBias"
);
auto
b_dims
=
ctx
->
GetInputDim
(
"LSTMBias"
);
PADDLE_ENFORCE_EQ
(
b_dims
.
size
(),
2
,
"Input(LSTMBias)'s rank must be 2."
);
PADDLE_ENFORCE_EQ
(
b_dims
.
size
(),
2
,
"Input(LSTMBias)'s rank must be 2."
);
...
...
paddle/fluid/operators/auc_op.h
浏览文件 @
b98b7440
...
@@ -60,20 +60,6 @@ class AucKernel : public framework::OpKernel<T> {
...
@@ -60,20 +60,6 @@ class AucKernel : public framework::OpKernel<T> {
const
T
*
inference_data
=
predict
->
data
<
T
>
();
const
T
*
inference_data
=
predict
->
data
<
T
>
();
const
auto
*
label_data
=
label
->
data
<
int64_t
>
();
const
auto
*
label_data
=
label
->
data
<
int64_t
>
();
// check if states are inited.
auto
*
tp_in
=
ctx
.
Input
<
Tensor
>
(
"TP"
);
auto
*
fp_in
=
ctx
.
Input
<
Tensor
>
(
"FP"
);
auto
*
tn_in
=
ctx
.
Input
<
Tensor
>
(
"TN"
);
auto
*
fn_in
=
ctx
.
Input
<
Tensor
>
(
"FN"
);
PADDLE_ENFORCE
(
tp_in
->
IsInitialized
(),
"true_positive is not inited!"
);
PADDLE_ENFORCE
(
fp_in
->
IsInitialized
(),
"false_negative is not inited!"
);
PADDLE_ENFORCE
(
tn_in
->
IsInitialized
(),
"true_negative is not inited!"
);
PADDLE_ENFORCE
(
fn_in
->
IsInitialized
(),
"false_positive is not inited!"
);
PADDLE_ENFORCE_EQ
(
tp_in
->
numel
(),
num_thresholds
,
""
);
PADDLE_ENFORCE_EQ
(
fp_in
->
numel
(),
num_thresholds
,
""
);
PADDLE_ENFORCE_EQ
(
tn_in
->
numel
(),
num_thresholds
,
""
);
PADDLE_ENFORCE_EQ
(
fn_in
->
numel
(),
num_thresholds
,
""
);
auto
*
tp_data
=
true_positive
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
());
auto
*
tp_data
=
true_positive
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
());
auto
*
fn_data
=
false_negative
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
());
auto
*
fn_data
=
false_negative
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
());
auto
*
tn_data
=
true_negative
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
());
auto
*
tn_data
=
true_negative
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
());
...
...
paddle/fluid/operators/batch_norm_mkldnn_op.cc
浏览文件 @
b98b7440
...
@@ -37,6 +37,95 @@ struct bn_type_traits {
...
@@ -37,6 +37,95 @@ struct bn_type_traits {
using
op_prim
=
typename
op_type
::
primitive_desc
;
using
op_prim
=
typename
op_type
::
primitive_desc
;
};
};
// Handler for the MKLDNN batch-norm forward pass: wraps a forward primitive
// descriptor and hands out memories / the forward primitive.
// NOTE(review): key_, dev_ctx_, is_reusing_ and AcquireMemoryFromPrimitive are
// not declared here; presumably they come from platform::MKLDNNHandler -
// confirm against that class.
class BatchNormMKLDNNHandler : public platform::MKLDNNHandler {
 public:
  // Forwards dev_ctx, engine and base_key to the base handler and keeps the
  // forward primitive descriptor for later use.
  BatchNormMKLDNNHandler(
      std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_pd,
      const platform::MKLDNNDeviceContext &dev_ctx, mkldnn::engine engine,
      const std::string &base_key)
      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {
    batch_norm_pd_ = batch_norm_pd;
  }

  // Acquires a memory described by the pd's weights primitive desc, backed by
  // `ptr`, using the "@scaleshift_mem_p" suffix.
  std::shared_ptr<memory> AcquireScaleshiftMemoryFromPrimitive(void *ptr) {
    return this->AcquireMemoryFromPrimitive(
        batch_norm_pd_->weights_primitive_desc(), ptr, "@scaleshift_mem_p");
  }

  // Acquires a memory described by the pd's mean primitive desc, backed by
  // `ptr`, using the "@mean_mem_p" suffix.
  std::shared_ptr<memory> AcquireMeanMemoryFromPrimitive(void *ptr) {
    return this->AcquireMemoryFromPrimitive(
        batch_norm_pd_->mean_primitive_desc(), ptr, "@mean_mem_p");
  }

  // Acquires a memory described by the pd's variance primitive desc, backed by
  // `ptr`, using the "@variance_mem_p" suffix.
  std::shared_ptr<memory> AcquireVarianceMemoryFromPrimitive(void *ptr) {
    return this->AcquireMemoryFromPrimitive(
        batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p");
  }

  // Returns the batch-norm forward primitive stored in the device context
  // under key_ + "@batch_norm_p", creating and storing it when absent.
  // is_test selects which batch_norm_fwd constructor form is used (see below).
  std::shared_ptr<batch_norm_fwd> AcquireTestTrainingBatchNormFwd(
      std::shared_ptr<memory> src_memory,
      std::shared_ptr<memory> scaleshift_memory,
      std::shared_ptr<memory> dst_memory, std::shared_ptr<memory> mean_memory,
      std::shared_ptr<memory> variance_memory, bool is_test) {
    auto prim_key = key_ + "@batch_norm_p";
    auto batch_norm_p =
        std::static_pointer_cast<batch_norm_fwd>(dev_ctx_.GetBlob(prim_key));

    // A missing primitive is only acceptable while is_reusing_ is false.
    PADDLE_ENFORCE((batch_norm_p != nullptr) || !is_reusing_,
                   "Fail to find batch norm primitive in device context");

    if (batch_norm_p == nullptr) {
      if (is_test) {
        // Mean and variance are passed as primitive::at arguments ahead of
        // scaleshift and dst.
        batch_norm_p = std::make_shared<batch_norm_fwd>(
            *batch_norm_pd_, *src_memory,
            (const mkldnn::primitive::at &)*mean_memory,
            (const mkldnn::primitive::at &)*variance_memory, *scaleshift_memory,
            *dst_memory);
      } else {
        // Mean and variance are passed after dst.
        batch_norm_p = std::make_shared<batch_norm_fwd>(
            *batch_norm_pd_, *src_memory, *scaleshift_memory, *dst_memory,
            *mean_memory, *variance_memory);
      }

      dev_ctx_.SetBlob(prim_key, batch_norm_p);
    } else {
      is_reusing_ = true;
    }

    return batch_norm_p;
  }

  // Builds a key string: each input dim followed by "-", then
  // std::to_string of epsilon, flag, is_test and format, then `suffix`.
  // No separator is inserted between the fields after the dims.
  static std::string GetHash(const memory::dims &input_dims, float epsilon,
                             unsigned flag, bool is_test, memory::format format,
                             const std::string &suffix = "") {
    auto dims2str = [](const memory::dims &operand_dims) {
      std::string dstr = "";
      for (size_t i = 0; i < operand_dims.size(); ++i) {
        dstr += std::to_string(operand_dims[i]) + "-";
      }
      return dstr;
    };
    return dims2str(input_dims) + std::to_string(epsilon) +
           std::to_string(flag) + std::to_string(is_test) +
           std::to_string(format) + suffix;
  }

 private:
  // Forward primitive descriptor supplied at construction.
  std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_pd_;
};
// Looks up the mkldnn memory stored in `dev_ctx` under `key`, enforces that it
// exists, points its data handle at `new_ptr` and returns it.
std::shared_ptr<memory> UpdateMemoryData(
    const platform::MKLDNNDeviceContext &dev_ctx, const std::string &key,
    void *new_ptr) {
  std::shared_ptr<memory> cached_memory =
      std::static_pointer_cast<memory>(dev_ctx.GetBlob(key));
  PADDLE_ENFORCE(
      cached_memory != nullptr,
      (std::string("Fail to find memory in device context [key: ") + key + "]")
          .c_str());
  cached_memory->set_data_handle(new_ptr);
  return cached_memory;
}
template
<
typename
T
,
typename
Container
>
template
<
typename
T
,
typename
Container
>
void
copy_to_weights
(
T
scale_begin
,
T
scale_end
,
T
shift_begin
,
T
shift_end
,
void
copy_to_weights
(
T
scale_begin
,
T
scale_end
,
T
shift_begin
,
T
shift_end
,
Container
*
c
)
{
Container
*
c
)
{
...
@@ -48,15 +137,6 @@ void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end,
...
@@ -48,15 +137,6 @@ void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end,
std
::
inserter
(
*
c
,
std
::
next
(
it
,
std
::
distance
(
scale_begin
,
scale_end
))));
std
::
inserter
(
*
c
,
std
::
next
(
it
,
std
::
distance
(
scale_begin
,
scale_end
))));
}
}
template
<
typename
Op
,
typename
...
Args
>
void
run_batch_norm_op
(
Args
&&
...
args
)
{
Op
batch_norm_op
{
args
...};
std
::
vector
<
mkldnn
::
primitive
>
pipeline
;
pipeline
.
push_back
(
batch_norm_op
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
}
}
// namespace
}
// namespace
template
<
typename
T
>
template
<
typename
T
>
...
@@ -110,6 +190,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -110,6 +190,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
PADDLE_ENFORCE
(
scale_tz
.
size
()
==
1
,
"Dims of scale tensor is NOT 1"
);
PADDLE_ENFORCE
(
scale_tz
.
size
()
==
1
,
"Dims of scale tensor is NOT 1"
);
const
unsigned
int
ic
=
scale_tz
[
0
];
const
unsigned
int
ic
=
scale_tz
[
0
];
// MKLDNN requires a single piece of memory for scale and shift/bias data
const
size_t
scaleshift_size
=
2
*
ic
;
std
::
vector
<
T
>
scaleshift_data
;
scaleshift_data
.
reserve
(
scaleshift_size
);
copy_to_weights
(
scale
->
data
<
T
>
(),
scale
->
data
<
T
>
()
+
ic
,
shift
->
data
<
T
>
(),
shift
->
data
<
T
>
()
+
ic
,
&
scaleshift_data
);
unsigned
flags
=
mkldnn
::
use_scale_shift
;
unsigned
flags
=
mkldnn
::
use_scale_shift
;
if
(
is_test
)
flags
|=
mkldnn
::
use_global_stats
;
if
(
is_test
)
flags
|=
mkldnn
::
use_global_stats
;
if
(
fuse_with_relu
)
flags
|=
mkldnn
::
fuse_bn_relu
;
if
(
fuse_with_relu
)
flags
|=
mkldnn
::
fuse_bn_relu
;
...
@@ -118,64 +206,69 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -118,64 +206,69 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn
::
memory
::
format
input_format
=
mkldnn
::
memory
::
format
input_format
=
platform
::
MKLDNNFormatForSize
(
src_tz
.
size
(),
x
->
format
());
platform
::
MKLDNNFormatForSize
(
src_tz
.
size
(),
x
->
format
());
auto
src_memory
=
memory
(
// keys for backward pass
{{{
src_tz
},
memory
::
data_type
::
f32
,
input_format
},
mkldnn_engine
},
const
std
::
string
key
=
BatchNormMKLDNNHandler
::
GetHash
(
to_void_cast
(
x_data
));
src_tz
,
epsilon
,
flags
,
is_test
,
input_format
,
ctx
.
op
().
Output
(
"SavedMean"
));
const
std
::
string
key_batch_norm_fwd_pd
=
key
+
"@bn_fwd_pd"
;
auto
user_src_md
=
platform
::
MKLDNNMemDesc
(
{
src_tz
},
platform
::
MKLDNNGetDataType
<
T
>
(),
input_format
);
// create primitive descriptor for batch norm forward
// create primitive descriptor for batch norm forward
using
bn_fwd_types
=
bn_type_traits
<
mkldnn
::
batch_normalization_forward
>
;
using
bn_fwd_types
=
bn_type_traits
<
mkldnn
::
batch_normalization_forward
>
;
auto
batch_norm_fwd_desc
=
bn_fwd_types
::
op_desc
{
auto
batch_norm_fwd_desc
=
propagation
,
src_memory
.
get_primitive_desc
().
desc
(),
epsilon
,
flags
};
bn_fwd_types
::
op_desc
{
propagation
,
user_src_md
,
epsilon
,
flags
};
std
::
shared_ptr
<
batch_norm_fwd
::
primitive_desc
>
batch_norm_fwd_pd
=
auto
batch_norm_fwd_pd
=
std
::
make_shared
<
batch_norm_fwd
::
primitive_desc
>
(
std
::
shared_ptr
<
batch_norm_fwd
::
primitive_desc
>
(
batch_norm_fwd_desc
,
mkldnn_engine
);
new
batch_norm_fwd
::
primitive_desc
(
batch_norm_fwd_desc
,
// Save conv_pd/src_memory/weights_memory for backward pass
mkldnn_engine
));
// Save the pd to be used in backward pass
const
std
::
string
key
=
ctx
.
op
().
Output
(
"SavedMean"
);
const
std
::
string
key_batch_norm_fwd_pd
=
key
+
"@bn_fwd_pd"
;
dev_ctx
.
SetBlob
(
key_batch_norm_fwd_pd
,
batch_norm_fwd_pd
);
dev_ctx
.
SetBlob
(
key_batch_norm_fwd_pd
,
batch_norm_fwd_pd
);
// MKLDNN requires a single piece of memory for scale and shift/bias data
BatchNormMKLDNNHandler
handler
(
batch_norm_fwd_pd
,
dev_ctx
,
mkldnn_engine
,
const
size_t
scaleshift_size
=
2
*
ic
;
key
);
std
::
vector
<
T
>
scaleshift_data
;
scaleshift_data
.
reserve
(
scaleshift_size
);
copy_to_weights
(
scale
->
data
<
T
>
(),
scale
->
data
<
T
>
()
+
ic
,
shift
->
data
<
T
>
(),
auto
src_memory
=
shift
->
data
<
T
>
()
+
ic
,
&
scaleshift_data
);
handler
.
AcquireSrcMemory
(
user_src_md
,
to_void_cast
(
x_data
)
);
// create mkldnn memory for weights(scale/shift)
// create mkldnn memory for weights(scale/shift)
auto
scaleshift_memory
=
memory
(
batch_norm_fwd_pd
->
weights_primitive_desc
(),
auto
scaleshift_memory
=
scaleshift_data
.
data
());
handler
.
AcquireScaleshiftMemoryFromPrimitive
(
scaleshift_data
.
data
());
// create mkldnn memory for output y tensor
// create mkldnn memory for output y tensor
auto
dst_memory
=
memory
(
batch_norm_fwd_pd
->
dst_primitive_desc
(),
y_data
);
auto
dst_memory
=
handler
.
AcquireDstMemory
(
batch_norm_fwd_pd
->
dst_primitive_desc
().
desc
(),
y_data
);
std
::
shared_ptr
<
batch_norm_fwd
>
batch_norm_p
;
if
(
is_test
)
{
if
(
is_test
)
{
// create mkldnn memory for stats (as input)
// create mkldnn memory for stats (as input)
auto
mean_memory
=
memory
(
batch_norm_fwd_pd
->
mean_primitive_desc
(),
std
::
shared_ptr
<
memory
>
mean_memory
=
to_void_cast
(
mean_data
));
handler
.
AcquireMeanMemoryFromPrimitive
(
to_void_cast
(
mean_data
));
auto
variance_memory
=
std
::
shared_ptr
<
memory
>
variance_memory
=
memory
(
batch_norm_fwd_pd
->
variance_primitive_desc
(),
handler
.
AcquireVarianceMemoryFromPrimitive
(
to_void_cast
(
variance_data
));
to_void_cast
(
variance_data
));
run_batch_norm_op
<
typename
bn_fwd_types
::
op_type
>
(
batch_norm_p
=
handler
.
AcquireTestTrainingBatchNormFwd
(
*
batch_norm_fwd_pd
,
src_memory
,
src_memory
,
scaleshift_memory
,
dst_memory
,
mean_memory
,
(
const
mkldnn
::
primitive
::
at
&
)
mean_memory
,
variance_memory
,
true
);
(
const
mkldnn
::
primitive
::
at
&
)
variance_memory
,
scaleshift_memory
,
dst_memory
);
}
else
{
}
else
{
// create mkldnn memory for stats (as output)
// create mkldnn memory for stats (as output)
auto
mean_memory
=
std
::
shared_ptr
<
memory
>
mean_memory
=
memory
(
batch_norm_fwd_pd
->
mean_primitive_desc
(),
batch_mean_data
);
handler
.
AcquireMeanMemoryFromPrimitive
(
batch_mean_data
);
auto
variance_memory
=
memory
(
std
::
shared_ptr
<
memory
>
variance_memory
=
batch_norm_fwd_pd
->
variance_primitive_desc
(),
batch_variance_data
);
handler
.
AcquireVarianceMemoryFromPrimitive
(
batch_variance_data
);
run_batch_norm_op
<
bn_fwd_types
::
op_type
>
(
*
batch_norm_fwd_pd
,
src_memory
,
batch_norm_p
=
handler
.
AcquireTestTrainingBatchNormFwd
(
scaleshift_memory
,
dst
_memory
,
src_memory
,
scaleshift_memory
,
dst_memory
,
mean
_memory
,
mean_memory
,
variance_memory
);
variance_memory
,
false
);
}
}
y
->
set_layout
(
DataLayout
::
kMKLDNN
);
y
->
set_format
(
platform
::
GetMKLDNNFormat
(
*
dst_memory
));
std
::
vector
<
mkldnn
::
primitive
>
pipeline
;
pipeline
.
push_back
(
*
batch_norm_p
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
if
(
!
is_test
)
{
if
(
!
is_test
)
{
// mkldnn only compute stats for current batch
// mkldnn only compute stats for current batch
// so we need compute momentum stats via Eigen lib
// so we need compute momentum stats via Eigen lib
...
@@ -192,10 +285,6 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -192,10 +285,6 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
running_variance_e
=
running_variance_e
=
variance_e
*
momentum
+
batch_variance_e
*
one_minus_momentum
;
variance_e
*
momentum
+
batch_variance_e
*
one_minus_momentum
;
}
}
y
->
set_layout
(
DataLayout
::
kMKLDNN
);
y
->
set_format
(
(
memory
::
format
)
dst_memory
.
get_primitive_desc
().
desc
().
data
.
format
);
}
}
};
};
...
@@ -242,94 +331,169 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -242,94 +331,169 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const
unsigned
int
ic
=
scale_tz
[
0
];
const
unsigned
int
ic
=
scale_tz
[
0
];
// Retrieve bn_fwd_pd from device context
const
std
::
string
key
=
ctx
.
op
().
Input
(
"SavedMean"
);
const
std
::
string
key_batch_norm_fwd_pd
=
key
+
"@bn_fwd_pd"
;
auto
batch_norm_fwd_pd
=
std
::
static_pointer_cast
<
batch_norm_fwd
::
primitive_desc
>
(
dev_ctx
.
GetBlob
(
key_batch_norm_fwd_pd
));
PADDLE_ENFORCE
(
batch_norm_fwd_pd
!=
nullptr
,
"Fail to find batch_norm_fwd_pd in device context"
);
using
bn_bwd_types
=
bn_type_traits
<
mkldnn
::
batch_normalization_backward
>
;
using
bn_bwd_types
=
bn_type_traits
<
mkldnn
::
batch_normalization_backward
>
;
// create mkldnn memory from input diff_y tensor
mkldnn
::
memory
::
format
dst_format
=
mkldnn
::
memory
::
format
dst_format
=
platform
::
MKLDNNFormatForSize
(
src_tz
.
size
(),
diff_y
->
format
());
platform
::
MKLDNNFormatForSize
(
src_tz
.
size
(),
diff_y
->
format
());
mkldnn
::
memory
::
format
input_format
=
platform
::
MKLDNNFormatForSize
(
src_tz
.
size
(),
x
->
format
());
unsigned
flags
=
mkldnn
::
use_scale_shift
;
// keys from forward pass
const
std
::
string
key
=
BatchNormMKLDNNHandler
::
GetHash
(
src_tz
,
epsilon
,
flags
,
false
,
input_format
,
ctx
.
op
().
Input
(
"SavedMean"
));
const
std
::
string
key_batch_norm_fwd_pd
=
key
+
"@bn_fwd_pd"
;
// keys for primitives reuse
const
std
::
string
key_with_hash
=
key
+
BatchNormMKLDNNHandler
::
GetHash
(
src_tz
,
epsilon
,
flags
,
false
,
input_format
);
const
std
::
string
key_batch_norm_bwd_p
=
key_with_hash
+
"@batch_norm_bwd_p"
;
const
std
::
string
key_batch_norm_src_mem_p
=
key_with_hash
+
"@batch_norm_bwd_src_mem_p"
;
const
std
::
string
key_batch_norm_mean_mem_p
=
key_with_hash
+
"@batch_norm_bwd_mean_mem_p"
;
const
std
::
string
key_batch_norm_variance_mem_p
=
key_with_hash
+
"@batch_norm_bwd_variance_mem_p"
;
const
std
::
string
key_batch_norm_scaleshift_mem_p
=
key_with_hash
+
"@batch_norm_bwd_scaleshift_mem_p"
;
const
std
::
string
key_batch_norm_diff_scaleshift_mem_p
=
key_with_hash
+
"@batch_norm_bwd_diff_scaleshift_mem_p"
;
const
std
::
string
key_batch_norm_diff_src_mem_p
=
key_with_hash
+
"@batch_norm_bwd_diff_src_mem_p"
;
const
std
::
string
key_batch_norm_diff_dst_mem_p
=
key_with_hash
+
"@batch_norm_bwd_diff_dst_mem_p"
;
primitive
reorder_diff_dst
;
bool
is_diff_dst_reordered
=
false
;
auto
user_diff_dst_memory
=
memory
(
auto
user_diff_dst_memory
=
memory
(
{{{
diff_dst_tz
},
memory
::
data_type
::
f32
,
dst_format
},
mkldnn_engine
},
{{{
diff_dst_tz
},
memory
::
data_type
::
f32
,
dst_format
},
mkldnn_engine
},
to_void_cast
(
diff_y_data
));
to_void_cast
(
diff_y_data
));
// create mkldnn memory from input x tensor
// MKLDNN requires a single piece of memory for scale and shift/bias data
mkldnn
::
memory
::
format
input_format
=
const
size_t
scaleshift_size
=
2
*
ic
;
platform
::
MKLDNNFormatForSize
(
src_tz
.
size
(),
x
->
format
());
auto
src_memory
=
memory
(
std
::
vector
<
T
>
scaleshift_data
;
scaleshift_data
.
reserve
(
scaleshift_size
);
copy_to_weights
(
scale_data
,
scale_data
+
ic
,
shift_data
,
shift_data
+
ic
,
&
scaleshift_data
);
std
::
vector
<
T
>
diff_scaleshift_data
;
diff_scaleshift_data
.
reserve
(
scaleshift_size
);
auto
batch_norm_fwd_pd
=
std
::
static_pointer_cast
<
batch_norm_fwd
::
primitive_desc
>
(
dev_ctx
.
GetBlob
(
key_batch_norm_fwd_pd
));
PADDLE_ENFORCE
(
batch_norm_fwd_pd
!=
nullptr
,
"Fail to find batch_norm_fwd_pd in device context"
);
auto
batch_norm_bwd_p
=
std
::
static_pointer_cast
<
batch_norm_bwd
>
(
dev_ctx
.
GetBlob
(
key_batch_norm_bwd_p
));
if
(
batch_norm_bwd_p
==
nullptr
)
{
auto
src_memory
=
std
::
shared_ptr
<
memory
>
(
new
memory
(
{{{
src_tz
},
memory
::
data_type
::
f32
,
input_format
},
mkldnn_engine
},
{{{
src_tz
},
memory
::
data_type
::
f32
,
input_format
},
mkldnn_engine
},
to_void_cast
(
x_data
));
to_void_cast
(
x_data
)
));
// for diff_dst, try to use same format as dst in forward pass
// for diff_dst, try to use same format as dst in forward pass
auto
diff_dst_pd
=
batch_norm_fwd_pd
.
get
()
->
dst_primitive_desc
();
auto
diff_dst_pd
=
batch_norm_fwd_pd
.
get
()
->
dst_primitive_desc
();
auto
diff_dst_md
=
diff_dst_pd
.
desc
();
auto
diff_dst_md
=
diff_dst_pd
.
desc
();
// create primitive descriptor for batch norm backward
// create primitive descriptor for batch norm backward
unsigned
flags
=
mkldnn
::
use_scale_shift
;
auto
batch_norm_bwd_desc
=
bn_bwd_types
::
op_desc
{
auto
batch_norm_bwd_desc
=
bn_bwd_types
::
op_desc
{
mkldnn
::
prop_kind
::
backward
,
diff_dst_md
,
mkldnn
::
prop_kind
::
backward
,
diff_dst_md
,
src_memory
.
get_primitive_desc
().
desc
(),
epsilon
,
flags
};
src_memory
->
get_primitive_desc
().
desc
(),
epsilon
,
flags
};
auto
batch_norm_bwd_pd
=
bn_bwd_types
::
op_prim
{
auto
batch_norm_bwd_pd
=
bn_bwd_types
::
op_prim
{
batch_norm_bwd_desc
,
mkldnn_engine
,
*
batch_norm_fwd_pd
};
batch_norm_bwd_desc
,
mkldnn_engine
,
*
batch_norm_fwd_pd
};
// reorder user_diff_dst if it's not in preferred format
// reorder user_diff_dst if it's not in preferred format
auto
diff_dst_memory
=
user_diff_dst_memory
;
auto
diff_dst_memory
=
std
::
make_shared
<
memory
>
(
user_diff_dst_memory
);
primitive
reorder_diff_dst
;
bool
is_diff_dst_reordered
=
false
;
if
(
diff_dst_pd
!=
user_diff_dst_memory
.
get_primitive_desc
())
{
if
(
diff_dst_pd
!=
user_diff_dst_memory
.
get_primitive_desc
())
{
diff_dst_memory
=
memory
(
diff_dst_pd
);
diff_dst_memory
=
std
::
make_shared
<
memory
>
(
diff_dst_pd
);
reorder_diff_dst
=
reorder
(
user_diff_dst_memory
,
diff_dst_memory
);
reorder_diff_dst
=
reorder
(
user_diff_dst_memory
,
*
diff_dst_memory
);
is_diff_dst_reordered
=
true
;
is_diff_dst_reordered
=
true
;
}
}
// create mkldnn memory for input tensors (src/mean/variance)
// create mkldnn memory for input tensors (src/mean/variance)
auto
mean_memory
=
memory
(
batch_norm_bwd_pd
.
mean_primitive_desc
(),
auto
mean_memory
=
std
::
make_shared
<
memory
>
(
batch_norm_bwd_pd
.
mean_primitive_desc
(),
to_void_cast
(
batch_mean_data
));
to_void_cast
(
batch_mean_data
));
auto
variance_memory
=
memory
(
batch_norm_bwd_pd
.
variance_primitive_desc
(),
auto
variance_memory
=
std
::
make_shared
<
memory
>
(
batch_norm_bwd_pd
.
variance_primitive_desc
(),
to_void_cast
(
batch_variance_data
));
to_void_cast
(
batch_variance_data
));
// MKLDNN requires a single piece of memory for scale and shift/bias data
const
size_t
scaleshift_size
=
2
*
ic
;
std
::
vector
<
T
>
scaleshift_data
;
scaleshift_data
.
reserve
(
scaleshift_size
);
copy_to_weights
(
scale_data
,
scale_data
+
ic
,
shift_data
,
shift_data
+
ic
,
&
scaleshift_data
);
// create mkldnn memory for input tensors (scale/shift)
// create mkldnn memory for input tensors (scale/shift)
auto
scaleshift_memory
=
memory
(
batch_norm_bwd_pd
.
weights_primitive_desc
(),
auto
scaleshift_memory
=
std
::
make_shared
<
memory
>
(
scaleshift_data
.
data
());
batch_norm_bwd_pd
.
weights_primitive_desc
(),
scaleshift_data
.
data
());
// create mkldnn memory for output diff weights (combined scale/shift)
// create mkldnn memory for output diff weights (combined scale/shift)
std
::
vector
<
T
>
diff_scaleshift_data
;
auto
diff_scaleshift_memory
=
std
::
make_shared
<
memory
>
(
diff_scaleshift_data
.
reserve
(
scaleshift_size
);
batch_norm_bwd_pd
.
diff_weights_primitive_desc
(),
auto
diff_scaleshift_memory
=
memory
(
batch_norm_bwd_pd
.
diff_weights_primitive_desc
(),
diff_scaleshift_data
.
data
());
diff_scaleshift_data
.
data
());
// here assume diff_src is in the same format of src
// here assume diff_src is in the same format of src
auto
diff_src_memory
=
memory
(
src_memory
.
get_primitive_desc
(),
diff_x_data
);
auto
diff_src_memory
=
std
::
make_shared
<
memory
>
(
src_memory
->
get_primitive_desc
(),
diff_x_data
);
// finally create batch_norm backward primitive
// finally create batch_norm backward primitive
auto
batch_norm_bwd_prim
=
batch_norm_bwd_p
=
std
::
make_shared
<
batch_norm_bwd
>
(
batch_norm_bwd
(
batch_norm_bwd_pd
,
src_memory
,
mean_memory
,
batch_norm_bwd_pd
,
*
src_memory
,
*
mean_memory
,
*
variance_memory
,
variance_memory
,
diff_dst_memory
,
scaleshift_memory
,
*
diff_dst_memory
,
*
scaleshift_memory
,
*
diff_src_memory
,
diff_src_memory
,
diff_scaleshift_memory
);
*
diff_scaleshift_memory
);
dev_ctx
.
SetBlob
(
key_batch_norm_bwd_p
,
batch_norm_bwd_p
);
dev_ctx
.
SetBlob
(
key_batch_norm_src_mem_p
,
src_memory
);
dev_ctx
.
SetBlob
(
key_batch_norm_mean_mem_p
,
mean_memory
);
dev_ctx
.
SetBlob
(
key_batch_norm_variance_mem_p
,
variance_memory
);
dev_ctx
.
SetBlob
(
key_batch_norm_scaleshift_mem_p
,
scaleshift_memory
);
dev_ctx
.
SetBlob
(
key_batch_norm_diff_scaleshift_mem_p
,
diff_scaleshift_memory
);
dev_ctx
.
SetBlob
(
key_batch_norm_diff_src_mem_p
,
diff_src_memory
);
dev_ctx
.
SetBlob
(
key_batch_norm_diff_dst_mem_p
,
diff_dst_memory
);
// set layout/format of output tensors
diff_x
->
set_layout
(
DataLayout
::
kMKLDNN
);
diff_x
->
set_format
((
memory
::
format
)
diff_src_memory
->
get_primitive_desc
()
.
desc
()
.
data
.
format
);
}
else
{
// primitives already exist
UpdateMemoryData
(
dev_ctx
,
key_batch_norm_src_mem_p
,
to_void_cast
(
x_data
));
UpdateMemoryData
(
dev_ctx
,
key_batch_norm_mean_mem_p
,
to_void_cast
(
batch_mean_data
));
UpdateMemoryData
(
dev_ctx
,
key_batch_norm_variance_mem_p
,
to_void_cast
(
batch_variance_data
));
UpdateMemoryData
(
dev_ctx
,
key_batch_norm_scaleshift_mem_p
,
scaleshift_data
.
data
());
UpdateMemoryData
(
dev_ctx
,
key_batch_norm_diff_scaleshift_mem_p
,
diff_scaleshift_data
.
data
());
auto
diff_src_memory
=
UpdateMemoryData
(
dev_ctx
,
key_batch_norm_diff_src_mem_p
,
to_void_cast
(
diff_x_data
));
auto
diff_dst_memory
=
UpdateMemoryData
(
dev_ctx
,
key_batch_norm_diff_dst_mem_p
,
to_void_cast
(
diff_y_data
));
// reorder user_diff_dst if it's not in preferred format
if
(
diff_dst_memory
->
get_primitive_desc
()
!=
user_diff_dst_memory
.
get_primitive_desc
())
{
reorder_diff_dst
=
reorder
(
user_diff_dst_memory
,
*
diff_dst_memory
);
is_diff_dst_reordered
=
true
;
}
// set layout/format of output tensors
diff_x
->
set_layout
(
DataLayout
::
kMKLDNN
);
diff_x
->
set_format
((
memory
::
format
)
diff_src_memory
->
get_primitive_desc
()
.
desc
()
.
data
.
format
);
}
// execute optional reorder and batch_norm backward primitive
// execute optional reorder and batch_norm backward primitive
std
::
vector
<
primitive
>
pipeline
;
std
::
vector
<
primitive
>
pipeline
;
if
(
is_diff_dst_reordered
)
pipeline
.
push_back
(
reorder_diff_dst
);
if
(
is_diff_dst_reordered
)
pipeline
.
push_back
(
reorder_diff_dst
);
pipeline
.
push_back
(
batch_norm_bwd_prim
);
pipeline
.
push_back
(
*
batch_norm_bwd_p
);
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
// copy back diff scale/shift to output tensors (diff scale/shift)
// copy back diff scale/shift to output tensors (diff scale/shift)
...
@@ -338,12 +502,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -338,12 +502,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std
::
copy
(
it
,
std
::
next
(
it
,
ic
),
diff_scale_data
);
std
::
copy
(
it
,
std
::
next
(
it
,
ic
),
diff_scale_data
);
std
::
copy
(
std
::
next
(
it
,
ic
),
std
::
end
(
diff_scaleshift_data
),
std
::
copy
(
std
::
next
(
it
,
ic
),
std
::
end
(
diff_scaleshift_data
),
diff_shift_data
);
diff_shift_data
);
// set layout/format of output tensors
diff_x
->
set_layout
(
DataLayout
::
kMKLDNN
);
diff_x
->
set_format
((
memory
::
format
)
diff_src_memory
.
get_primitive_desc
()
.
desc
()
.
data
.
format
);
}
}
};
};
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/detection/CMakeLists.txt
浏览文件 @
b98b7440
...
@@ -29,6 +29,6 @@ target_assign_op.cu)
...
@@ -29,6 +29,6 @@ target_assign_op.cu)
detection_library
(
polygon_box_transform_op SRCS polygon_box_transform_op.cc
detection_library
(
polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu
)
polygon_box_transform_op.cu
)
detection_library
(
rpn_target_assign_op SRCS rpn_target_assign_op.cc
)
detection_library
(
rpn_target_assign_op SRCS rpn_target_assign_op.cc
)
detection_library
(
generate_proposals_op SRCS generate_proposals_op.cc
)
#
Export local libraries to parent
#Export local libraries to parent
set
(
DETECTION_LIBRARY
${
LOCAL_DETECTION_LIBS
}
PARENT_SCOPE
)
set
(
DETECTION_LIBRARY
${
LOCAL_DETECTION_LIBS
}
PARENT_SCOPE
)
paddle/fluid/operators/detection/generate_proposals_op.cc
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
LoDTensor
=
framework
::
LoDTensor
;
struct
AppendProposalsFunctor
{
LoDTensor
*
out_
;
int64_t
offset_
;
Tensor
*
to_add_
;
AppendProposalsFunctor
(
LoDTensor
*
out
,
int64_t
offset
,
Tensor
*
to_add
)
:
out_
(
out
),
offset_
(
offset
),
to_add_
(
to_add
)
{}
template
<
typename
T
>
void
operator
()()
const
{
auto
*
out_data
=
out_
->
data
<
T
>
();
auto
*
to_add_data
=
to_add_
->
data
<
T
>
();
memcpy
(
out_data
+
offset_
,
to_add_data
,
to_add_
->
numel
()
*
sizeof
(
T
));
}
};
// Operator definition for generate_proposals: validates that all inputs are
// present, declares the output shapes, and selects the kernel type.
class GenerateProposalsOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks the five required inputs and sets the output dims. The number of
  // proposals is only known at run time, so the leading dim is -1.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Scores"), "Input(Scores) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("BboxDeltas"),
                   "Input(BboxDeltas) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("Anchors"),
                   "Input(Anchors) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("Variances"),
                   "Input(Variances) shouldn't be null.");

    // The input dims were previously fetched into locals that were never
    // read; those dead lookups have been removed.
    ctx->SetOutputDim("RpnRois", {-1, 4});
    ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
  }

 protected:
  // The kernel data type follows the "Anchors" input; the place is always
  // CPUPlace.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()),
        platform::CPUPlace());
  }
};
// Decodes box deltas relative to anchors into corner-form boxes.
//
// Each row of `all_anchors` is read as (v0, v1, v2, v3) and treated as
// (xmin, ymin, xmax, ymax). For every row the anchor width/height/center are
// computed, the first two deltas shift the center (scaled by the anchor
// size), the last two deltas scale the size through exp(), and the result is
// written to `proposals` as (xmin, ymin, xmax, ymax). When `variances` is
// non-null each delta is first multiplied by the matching variance entry.
//
// `proposals` is allocated on ctx.GetPlace() by this function; the caller is
// responsible for having set its dims.
template <class T>
void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
              Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) {
  T *proposals_data = proposals->mutable_data<T>(ctx.GetPlace());

  const int64_t row = all_anchors->dims()[0];
  const int64_t len = all_anchors->dims()[1];

  auto *deltas = bbox_deltas->data<T>();
  auto *anchors = all_anchors->data<T>();
  const T *var = nullptr;
  if (variances) {
    var = variances->data<T>();
  }

  for (int64_t i = 0; i < row; ++i) {
    const int64_t base = i * len;

    // Anchor geometry.
    const T anchor_w = anchors[base + 2] - anchors[base];
    const T anchor_h = anchors[base + 3] - anchors[base + 1];
    const T anchor_cx = (anchors[base + 2] + anchors[base]) / 2;
    const T anchor_cy = (anchors[base + 3] + anchors[base + 1]) / 2;

    // Deltas, optionally pre-scaled by the per-row variances.
    T dx = deltas[base];
    T dy = deltas[base + 1];
    T dw = deltas[base + 2];
    T dh = deltas[base + 3];
    if (variances) {
      dx = var[base] * dx;
      dy = var[base + 1] * dy;
      dw = var[base + 2] * dw;
      dh = var[base + 3] * dh;
    }

    const T cx = dx * anchor_w + anchor_cx;
    const T cy = dy * anchor_h + anchor_cy;
    const T w = std::exp(dw) * anchor_w;
    const T h = std::exp(dh) * anchor_h;

    // Center/size form back to corner form.
    proposals_data[base] = cx - w / 2;
    proposals_data[base + 1] = cy - h / 2;
    proposals_data[base + 2] = cx + w / 2;
    proposals_data[base + 3] = cy + h / 2;
  }
}
// Clamps every coordinate of `boxes` in place. Elements at positions 0 and 2
// of each group of four are clamped to [0, im_info[1] - 1]; positions 1 and 3
// are clamped to [0, im_info[0] - 1].
// NOTE(review): im_info presumably holds (height, width, scale) — confirm
// against the caller.
template <class T>
void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor &im_info,
                    Tensor *boxes) {
  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
  const T *im_info_data = im_info.data<T>();

  for (int64_t pos = 0; pos < boxes->numel(); ++pos) {
    // pos % 4 in {0, 2} <=> pos is even; pos % 4 in {1, 3} <=> pos is odd.
    const bool even_slot = (pos % 2 == 0);
    const T upper = even_slot ? im_info_data[1] - 1 : im_info_data[0] - 1;
    boxes_data[pos] = std::max(std::min(boxes_data[pos], upper), 0.0f);
  }
}
// Collects the row indices of boxes that pass a size and center test.
//
// `min_size` is first multiplied by im_info[2]. A box (read as four
// consecutive values x0, y0, x1, y1) is kept when both its width
// (x1 - x0 + 1) and height (y1 - y0 + 1) are at least the scaled min_size and
// its center lies at or below im_info[1] horizontally and im_info[0]
// vertically. The kept indices are written to `keep` (int), which is finally
// resized to the number of kept boxes.
template <class T>
void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
                 float min_size, const Tensor &im_info, Tensor *keep) {
  const T *im_info_data = im_info.data<T>();
  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
  min_size *= im_info_data[2];

  // Allocate room for the worst case (every box kept); shrunk at the end.
  keep->Resize({boxes->dims()[0], 1});
  int *keep_data = keep->mutable_data<int>(ctx.GetPlace());

  int keep_len = 0;
  for (int i = 0; i < boxes->dims()[0]; ++i) {
    const T *box = boxes_data + 4 * i;
    const T ws = box[2] - box[0] + 1;
    const T hs = box[3] - box[1] + 1;
    const T x_ctr = box[0] + ws / 2;
    const T y_ctr = box[1] + hs / 2;

    const bool big_enough = ws >= min_size && hs >= min_size;
    const bool center_inside =
        x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0];
    if (!(big_enough && center_inside)) {
      continue;
    }
    keep_data[keep_len++] = i;
  }
  keep->Resize({keep_len});
}
// Comparator that orders (score, index) pairs by descending score.
// Templated on the score type so that comparisons are done in the caller's
// precision; the previous float-only signature silently converted double
// scores to float on every comparison.
template <class T>
bool SortScorePairDescend(const std::pair<T, int> &pair1,
                          const std::pair<T, int> &pair2) {
  return pair1.first > pair2.first;
}

// Appends one (score, index) pair per entry of `scores` to `sorted_indices`
// and then stable-sorts the whole vector by descending score, so entries with
// equal scores keep their original relative order.
//
// scores:          input scores; index i of the pair refers to scores[i].
// sorted_indices:  output vector (must be non-null); existing contents are
//                  kept and take part in the sort.
template <class T>
void GetMaxScoreIndex(const std::vector<T> &scores,
                      std::vector<std::pair<T, int>> *sorted_indices) {
  sorted_indices->reserve(sorted_indices->size() + scores.size());
  for (size_t i = 0; i < scores.size(); ++i) {
    sorted_indices->push_back(std::make_pair(scores[i], static_cast<int>(i)));
  }
  // Sort the score pair according to the scores in descending order
  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
                   SortScorePairDescend<T>);
}
// Area of a box given as (xmin, ymin, xmax, ymax).
//
// Returns 0 when the coordinates are invalid (xmax < xmin or ymax < ymin).
// When `normalized` is true the area is w * h; otherwise the box is treated
// as covering whole pixels and the area is (w + 1) * (h + 1).
template <class T>
T BBoxArea(const T *box, const bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
    return static_cast<T>(0.);
  }
  const T w = box[2] - box[0];
  const T h = box[3] - box[1];
  if (normalized) {
    return w * h;
  }
  // If coordinate values are not within range [0, 1].
  return (w + 1) * (h + 1);
}

// Intersection-over-union of two boxes given as (xmin, ymin, xmax, ymax).
//
// Returns 0 when the boxes do not overlap. The intersection uses the same
// extent convention as BBoxArea: for non-normalized boxes one pixel is added
// to each extent. Previously the intersection omitted that pixel while the
// two areas included it, so identical non-normalized boxes scored below 1.
// Extents are clamped at 0 so malformed boxes cannot produce a negative
// intersection.
template <class T>
T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
      box2[3] < box1[1]) {
    return static_cast<T>(0.);
  }
  const T pixel = normalized ? static_cast<T>(0) : static_cast<T>(1);
  const T inter_xmin = std::max(box1[0], box2[0]);
  const T inter_ymin = std::max(box1[1], box2[1]);
  const T inter_xmax = std::min(box1[2], box2[2]);
  const T inter_ymax = std::min(box1[3], box2[3]);
  const T inter_w = std::max(static_cast<T>(0), inter_xmax - inter_xmin + pixel);
  const T inter_h = std::max(static_cast<T>(0), inter_ymax - inter_ymin + pixel);
  const T inter_area = inter_w * inter_h;
  const T bbox1_area = BBoxArea<T>(box1, normalized);
  const T bbox2_area = BBoxArea<T>(box2, normalized);
  return inter_area / (bbox1_area + bbox2_area - inter_area);
}
// Greedy non-maximum suppression.
//
// Boxes are visited in descending score order (ties keep input order). A box
// is selected when its overlap with every previously selected box is at most
// the current threshold. After each selection, if eta < 1 and the threshold
// is above 0.5, the threshold is multiplied by eta.
//
// ctx:            device context used to allocate the result.
// bbox:           box tensor; dims()[0] is the box count and dims()[1] the
//                 stride between boxes. Must not be null.
// scores:         one score per box. Must not be null.
// nms_threshold:  initial overlap threshold.
// eta:            threshold decay factor.
//
// Returns an int tensor holding the selected box indices in selection order.
template <class T>
Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
           const T nms_threshold, const float eta) {
  PADDLE_ENFORCE_NOT_NULL(bbox);
  PADDLE_ENFORCE_NOT_NULL(scores);
  const int64_t num_boxes = bbox->dims()[0];
  // 4: [xmin ymin xmax ymax]
  const int64_t box_size = bbox->dims()[1];

  std::vector<T> scores_data(num_boxes);
  std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
  std::vector<std::pair<T, int>> sorted_indices;
  GetMaxScoreIndex<T>(scores_data, &sorted_indices);

  std::vector<int> selected_indices;
  T adaptive_threshold = nms_threshold;
  const T *bbox_data = bbox->data<T>();

  // Walk the sorted candidates in place. The previous implementation erased
  // the front element of the vector on every iteration, which is quadratic
  // in the number of boxes.
  for (const auto &candidate : sorted_indices) {
    const int idx = candidate.second;
    bool keep = true;
    for (const int kept_idx : selected_indices) {
      const T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
                                          bbox_data + kept_idx * box_size,
                                          false);
      // Written as a negated <= so a NaN overlap rejects the box, exactly as
      // the original `flag = (overlap <= adaptive_threshold)` did.
      if (!(overlap <= adaptive_threshold)) {
        keep = false;
        break;
      }
    }
    if (!keep) {
      continue;
    }
    selected_indices.push_back(idx);
    if (eta < 1 && adaptive_threshold > 0.5) {
      adaptive_threshold *= eta;
    }
  }

  const int selected_num = static_cast<int>(selected_indices.size());
  Tensor keep_nms;
  keep_nms.Resize({selected_num});
  int *keep_data = keep_nms.mutable_data<int>(ctx.GetPlace());
  std::copy(selected_indices.begin(), selected_indices.end(), keep_data);
  return keep_nms;
}
// Kernel for the generate_proposals operator.
//
// Compute() transposes Scores and BboxDeltas so that the channel axis comes
// last, runs ProposalForOneImage() on each batch item, and concatenates the
// per-image results into RpnRois / RpnRoiProbs, recording the per-image
// boundaries as a one-level LoD.
template <typename DeviceContext, typename T>
class GenerateProposalsKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto *scores = context.Input<Tensor>("Scores");
    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
    auto *im_info = context.Input<Tensor>("ImInfo");
    auto *anchors = context.Input<Tensor>("Anchors");
    auto *variances = context.Input<Tensor>("Variances");

    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");

    int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
    int post_nms_top_n = context.Attr<int>("post_nms_topN");
    float nms_thresh = context.Attr<float>("nms_thresh");
    float min_size = context.Attr<float>("min_size");
    float eta = context.Attr<float>("eta");

    auto &dev_ctx = context.template device_context<DeviceContext>();

    auto scores_dim = scores->dims();
    int64_t num = scores_dim[0];
    int64_t c_score = scores_dim[1];
    int64_t h_score = scores_dim[2];
    int64_t w_score = scores_dim[3];

    auto bbox_dim = bbox_deltas->dims();
    int64_t c_bbox = bbox_dim[1];
    int64_t h_bbox = bbox_dim[2];
    int64_t w_bbox = bbox_dim[3];

    // Allocate the outputs for the worst case in which every candidate is
    // kept; they are shrunk to the real proposal count at the end. One score
    // exists per candidate box, so RpnRoiProbs needs scores->numel() entries.
    // It was previously sized scores->numel() / 4, which the appends below
    // could overrun.
    rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
                              context.GetPlace());
    rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());

    // Move the channel axis last: (0, 1, 2, 3) -> (0, 2, 3, 1).
    Tensor bbox_deltas_swap, scores_swap;
    bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
                                     dev_ctx.GetPlace());
    scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
                                dev_ctx.GetPlace());

    math::Transpose<DeviceContext, T, 4> trans;
    std::vector<int> axis = {0, 2, 3, 1};
    trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
    trans(dev_ctx, *scores, &scores_swap, axis);

    framework::LoD lod;
    std::vector<size_t> lod0(1, 0);

    // NOTE(review): the const_casts below reshape the *input* tensors in
    // place to (numel / 4, 4). Behavior kept as-is; confirm no other consumer
    // relies on the original dims of Anchors / Variances.
    Tensor *anchor = const_cast<framework::Tensor *>(anchors);
    anchor->Resize({anchors->numel() / 4, 4});
    Tensor *var = const_cast<framework::Tensor *>(variances);
    var->Resize({var->numel() / 4, 4});

    int64_t num_proposals = 0;
    for (int64_t i = 0; i < num; ++i) {
      Tensor im_info_slice = im_info->Slice(i, i + 1);
      Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
      Tensor scores_slice = scores_swap.Slice(i, i + 1);

      bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
      scores_slice.Resize({h_score * w_score * c_score, 1});

      std::pair<Tensor, Tensor> tensor_pair =
          ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var,
                              bbox_deltas_slice, scores_slice, pre_nms_top_n,
                              post_nms_top_n, nms_thresh, min_size, eta);
      // Named so as not to shadow the `scores` input pointer above.
      Tensor image_proposals = tensor_pair.first;
      Tensor image_scores = tensor_pair.second;

      // Offsets are in elements: 4 values per proposal, 1 per score.
      framework::VisitDataType(
          framework::ToDataType(rpn_rois->type()),
          AppendProposalsFunctor(rpn_rois, 4 * num_proposals,
                                 &image_proposals));
      framework::VisitDataType(
          framework::ToDataType(rpn_roi_probs->type()),
          AppendProposalsFunctor(rpn_roi_probs, num_proposals, &image_scores));

      num_proposals += image_proposals.dims()[0];
      lod0.emplace_back(num_proposals);
    }

    lod.emplace_back(lod0);
    rpn_rois->set_lod(lod);
    rpn_roi_probs->set_lod(lod);
    rpn_rois->Resize({num_proposals, 4});
    rpn_roi_probs->Resize({num_proposals, 1});
  }

  // Produces the proposals and their scores for a single image.
  //
  // Steps: rank candidates by score (keeping the best pre_nms_top_n when that
  // limit is positive and smaller than the candidate count), gather the
  // matching deltas/anchors/variances, decode boxes, clip them, drop boxes
  // rejected by FilterBoxes, then (when nms_thresh > 0) apply NMS and keep at
  // most post_nms_top_n results when that limit is positive.
  //
  // Returns (boxes, scores); both tensors have the same number of rows.
  std::pair<Tensor, Tensor> ProposalForOneImage(
      const DeviceContext &ctx, const Tensor &im_info_slice,
      const Tensor &anchors, const Tensor &variances,
      const Tensor &bbox_deltas_slice,  // [M, 4]
      const Tensor &scores_slice,       // [N, 1]
      int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
      float eta) const {
    auto *scores_data = scores_slice.data<T>();

    // Sort index
    Tensor index_t;
    index_t.Resize({scores_slice.numel()});
    int *index = index_t.mutable_data<int>(ctx.GetPlace());
    for (int i = 0; i < scores_slice.numel(); ++i) {
      index[i] = i;
    }
    // Plain lambda instead of std::function: no type-erasure overhead and no
    // dependency on <functional>, which this file does not include.
    auto compare = [scores_data](const int64_t &i, const int64_t &j) {
      return scores_data[i] > scores_data[j];
    };

    if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
      std::sort(index, index + scores_slice.numel(), compare);
    } else {
      // Only partitions: the first pre_nms_top_n entries are the best ones
      // but are not themselves ordered.
      std::nth_element(index, index + pre_nms_top_n,
                       index + scores_slice.numel(), compare);
      index_t.Resize({pre_nms_top_n});
    }

    Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
    scores_sel.mutable_data<T>({index_t.numel(), 1}, ctx.GetPlace());
    bbox_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
    anchor_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
    var_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());

    CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
    CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
    CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
    CPUGather<T>(ctx, variances, index_t, &var_sel);

    Tensor proposals;
    proposals.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
    BoxCoder<T>(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals);

    ClipTiledBoxes<T>(ctx, im_info_slice, &proposals);

    Tensor keep;
    FilterBoxes<T>(ctx, &proposals, min_size, im_info_slice, &keep);

    // bbox_sel is reused from here on to hold the filtered boxes.
    Tensor scores_filter;
    bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
    scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
    CPUGather<T>(ctx, proposals, keep, &bbox_sel);
    CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
    if (nms_thresh <= 0) {
      // Return the scores that belong to the filtered boxes. This used to
      // return scores_sel, whose rows still correspond to the unfiltered
      // candidate list and therefore did not line up with bbox_sel.
      return std::make_pair(bbox_sel, scores_filter);
    }

    Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);

    if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
      keep_nms.Resize({post_nms_top_n});
    }

    // proposals and scores_sel are reused as the final output buffers.
    proposals.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
    scores_sel.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
    CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
    CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);

    return std::make_pair(proposals, scores_sel);
  }
};
// Declares the inputs, outputs, attributes and user-facing documentation of
// the generate_proposals operator.
class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Scores", "The scores of anchors should be foreground.");
    AddInput("BboxDeltas", "bbox_deltas.");
    AddInput("ImInfo", "Information for image reshape.");
    AddInput("Anchors", "All anchors.");
    AddInput("Variances", " variances");

    // Both outputs were previously described as "Anchors.", which is wrong:
    // they hold the generated proposals and their scores.
    AddOutput("RpnRois", "The generated proposal boxes, in shape (num_rois, 4).");
    AddOutput("RpnRoiProbs",
              "The scores of the generated proposals, in shape (num_rois, 1).");
    AddAttr<int>("pre_nms_topN", "pre_nms_topN");
    AddAttr<int>("post_nms_topN", "post_nms_topN");
    AddAttr<float>("nms_thresh", "nms_thresh");
    AddAttr<float>("min_size", "min size");
    AddAttr<float>("eta", "eta");
    AddComment(R"DOC(
Generate Proposals OP
This operator proposes rois according to each box with their probability to be a foreground object and
the box can be calculated by anchors. BboxDeltas and Scores are the output of RPN. Final proposals
could be used to train detection net.
Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number
of anchors, H and W are height and width of the feature map.
BboxDeltas is the difference between predicted box location and anchor location. In format of (N, 4*A, H, W)
For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and
calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area.
Finally, apply nms to get final proposals as output.
)DOC");
  }
};
}
// namespace operators
}
// namespace paddle
// Short alias for the operator namespace, used by the registrations below.
namespace ops = paddle::operators;
// Register the generate_proposals operator together with its proto maker.
// EmptyGradOpMaker is passed as the gradient-op maker.
REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp,
                  ops::GenerateProposalsOpMaker,
                  paddle::framework::EmptyGradOpMaker);
// Register the kernel: only CPUDeviceContext with float is instantiated here.
REGISTER_OP_CPU_KERNEL(
    generate_proposals,
    ops::GenerateProposalsKernel<paddle::platform::CPUDeviceContext, float>);
paddle/fluid/operators/fake_dequantize_op.cc
浏览文件 @
b98b7440
...
@@ -18,15 +18,32 @@ limitations under the License. */
...
@@ -18,15 +18,32 @@ limitations under the License. */
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
// CPU specialization of the dequantize functor: every element of `in` is
// multiplied by (scale[0] / max_range) and written to `out`, evaluated
// through the context's Eigen device.
template <typename T>
struct DequantizeFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& dev_ctx,
                  const framework::Tensor* in, const framework::Tensor* scale,
                  T max_range, framework::Tensor* out) {
    auto input_vec = framework::EigenVector<T>::Flatten(*in);
    // Only the first element of the scale tensor is used.
    const T* scale_ptr = scale->data<T>();
    auto output_vec = framework::EigenVector<T>::Flatten(*out);

    auto& eigen_dev = *dev_ctx.eigen_device();
    const T ratio = scale_ptr[0] / max_range;
    output_vec.device(eigen_dev) = ratio * input_vec;
  }
};

// Explicit instantiations for the supported element types.
template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
class
FakeDequantizeMaxAbsOp
:
public
framework
::
OperatorWithKernel
{
class
FakeDequantizeMaxAbsOp
:
public
framework
::
OperatorWithKernel
{
public:
public:
FakeDequantizeMaxAbsOp
(
const
std
::
string
&
type
,
FakeDequantizeMaxAbsOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
)
const
framework
::
AttributeMap
&
attrs
)
:
OperatorWithKernel
(
type
,
inputs
,
outputs
,
attrs
)
{}
:
OperatorWithKernel
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of FakeDequantizeMaxAbsOp should not be null."
);
"Input(X) of FakeDequantizeMaxAbsOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
...
@@ -42,21 +59,17 @@ class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -42,21 +59,17 @@ class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput
(
"X"
,
AddInput
(
"X"
,
"(Tensor) The input with float-32/64 type is the "
"(Tensor) The input with float-32/64 type is the "
"low precision tensor."
);
"low precision tensor."
);
AddInput
(
"Scale"
,
"(float) The scale in quantization stage."
);
AddOutput
(
"Out"
,
AddOutput
(
"Out"
,
"(Tensor) The output is the dequantized high "
"(Tensor) The output is the dequantized high "
"precision tensor."
);
"precision tensor."
);
AddAttr
<
int
>
(
"num_bits"
,
AddAttr
<
float
>
(
"max_range"
,
"(float) The max range in quantization stage."
);
"(int) `num_bits` is the quantization level bits, "
"such as 2, 5, 8."
);
AddAttr
<
float
>
(
"scale"
,
"(float) The maximum absolute value of low precision tensor."
"It is usually calculated by the fake_quantize_max_abs_op."
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
FakeDequantizeMaxAbsOp operator.
FakeDequantizeMaxAbsOp operator.
This calculation is an opposite operation of FakeQuantizeMaxAbsOp:
This calculation is an opposite operation of FakeQuantizeMaxAbsOp:
$$Out = \frac{scale*X}{
2^{num_bits} - 1
}$$
$$Out = \frac{scale*X}{
max_range
}$$
)DOC"
);
)DOC"
);
}
}
...
...
paddle/fluid/operators/fake_dequantize_op.cu
浏览文件 @
b98b7440
...
@@ -14,6 +14,42 @@ limitations under the License. */
...
@@ -14,6 +14,42 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.h"
#include "paddle/fluid/operators/fake_dequantize_op.h"
namespace
paddle
{
namespace
operators
{
// CUDA kernel: out[i] = in[i] * scale[0] / max_range for every i < num.
// Each thread handles the single element whose index equals its global
// thread id; threads past the end do nothing.
template <typename T>
__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num,
                             T* out) {
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= num) {
    return;
  }
  out[gid] = in[gid] * scale[0] / max_range;
}
// CUDA specialization of DequantizeFunctor.
// Launches KeDequantize on the stream owned by `dev_ctx`, using 512 threads
// per block and enough blocks to cover every element of `in`.
template <typename T>
struct DequantizeFunctor<platform::CUDADeviceContext, T> {
  void operator()(const platform::CUDADeviceContext& dev_ctx,
                  const framework::Tensor* in, const framework::Tensor* scale,
                  T max_range, framework::Tensor* out) {
    const T* src = in->data<T>();
    const T* scale_ptr = scale->data<T>();
    T* dst = out->mutable_data<T>(dev_ctx.GetPlace());

    int elem_count = in->numel();
    int threads_per_block = 512;
    // Ceiling division so that a partially filled last block is still launched.
    int block_count = (elem_count + threads_per_block - 1) / threads_per_block;

    KeDequantize<T><<<block_count, threads_per_block, 0, dev_ctx.stream()>>>(
        src, scale_ptr, max_range, elem_count, dst);
  }
};

// Explicit instantiations for the element types this file provides.
template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
using
CUDA
=
paddle
::
platform
::
CUDADeviceContext
;
using
CUDA
=
paddle
::
platform
::
CUDADeviceContext
;
REGISTER_OP_CUDA_KERNEL
(
fake_dequantize_max_abs
,
REGISTER_OP_CUDA_KERNEL
(
fake_dequantize_max_abs
,
...
...
paddle/fluid/operators/fake_dequantize_op.h
浏览文件 @
b98b7440
...
@@ -19,22 +19,29 @@ limitations under the License. */
...
@@ -19,22 +19,29 @@ limitations under the License. */
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
// Device-dispatched functor that performs the fake-dequantize computation.
// Only the call operator is declared here; each device provides its own
// definition/specialization elsewhere.
//
//   dev_ctx   - device context the computation runs on.
//   in        - input tensor to dequantize.
//   scale     - tensor holding the quantization scale.
//   max_range - divisor applied to the scaled input.
//   out       - output tensor receiving the result.
template <typename DeviceContext, typename T>
struct DequantizeFunctor {
  void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
                  const framework::Tensor* scale, T max_range,
                  framework::Tensor* out);
};
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
FakeDequantizeMaxAbsKernel
:
public
framework
::
OpKernel
<
T
>
{
class
FakeDequantizeMaxAbsKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
virtual
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
virtual
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
auto
*
in
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
*
in
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
*
scale
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Scale"
);
auto
*
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
auto
*
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
out
->
mutable_data
<
T
>
(
in
->
place
());
int
num_bits
=
ctx
.
Attr
<
int
>
(
"num_bits"
);
float
max_range
=
ctx
.
Attr
<
float
>
(
"max_range"
);
T
scale
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"scale"
));
int
range
=
std
::
pow
(
2
,
num_bits
)
-
1
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
out
->
mutable_data
<
T
>
(
dev_ctx
.
GetPlace
());
auto
eigen_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
out
);
DequantizeFunctor
<
DeviceContext
,
T
>
()(
dev_ctx
,
in
,
scale
,
auto
eigen_in
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
in
);
static_cast
<
T
>
(
max_range
),
out
);
auto
&
dev
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
eigen_out
.
device
(
dev
)
=
(
scale
/
range
)
*
eigen_in
;
}
}
};
};
...
...
paddle/fluid/operators/fetch_barrier_op.cc
浏览文件 @
b98b7440
...
@@ -52,6 +52,8 @@ class FetchBarrierOp : public framework::OperatorBase {
...
@@ -52,6 +52,8 @@ class FetchBarrierOp : public framework::OperatorBase {
class
FetchBarrierOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
FetchBarrierOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
public:
void
Make
()
{
void
Make
()
{
AddOutput
(
"Out"
,
"(Any) Dummy outputs, used for control dependency"
)
.
AsDuplicable
();
AddComment
(
R"DOC(
AddComment
(
R"DOC(
SendBarrier operator
SendBarrier operator
...
...
paddle/fluid/operators/fusion_gru_op.cc
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_gru_op.h"
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
namespace
paddle
{
namespace
operators
{
// Validates the inputs/outputs of fusion_gru and infers the output shapes.
//
// With X of shape (T x M) and WeightX of shape (M x 3D):
//   Hidden, BatchedHidden, BatchResetHiddenPrev : (T x D)
//   BatchedGate                                 : (T x 3D)
//   XX                                          : (T x min(M, 3D))
// Hidden and XX share the LoD of X.
//
// The width of WeightX must be a multiple of 3. Otherwise the integer division
// that derives frame_size would silently truncate, and BatchedGate (sized from
// wx_dims[1]) would disagree with the 3 * frame_size layout that WeightH and
// Bias are validated against below.
void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
                 "Input(WeightX) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
                 "Input(WeightH) of GRU should not be null.");

  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
                 "Output(BatchedGate) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
                 "Output(BatchResetHiddenPrev) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
                 "Output(BatchedHidden) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                 "Output(Hidden) of GRU should not be null.");

  auto x_dims = ctx->GetInputDim("X");
  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");

  auto wx_dims = ctx->GetInputDim("WeightX");
  PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
                    "The rank of Input(WeightX) should be 2.");
  PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
                    "The first dimension of Input(WeightX) "
                    "should be %d.",
                    x_dims[1]);
  // Reject widths that cannot be split evenly into the three GRU gates.
  PADDLE_ENFORCE_EQ(wx_dims[1] % 3, 0,
                    "The second dimension of Input(WeightX) "
                    "should be a multiple of 3.");

  int frame_size = static_cast<int>(wx_dims[1] / 3);
  auto wh_dims = ctx->GetInputDim("WeightH");
  PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
                    "The rank of Input(WeightH) should be 2.");
  PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
                    "The first dimension of Input(WeightH) "
                    "should be %d.",
                    frame_size);
  PADDLE_ENFORCE_EQ(wh_dims[1], 3 * frame_size,
                    "The second dimension of Input(WeightH) "
                    "should be 3 * %d.",
                    frame_size);

  // H0 is optional; only its width is validated here.
  if (ctx->HasInput("H0")) {
    auto h0_dims = ctx->GetInputDim("H0");
    PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
                      "The width of H0 must be equal to frame_size.");
  }
  // Bias is optional; when present it must be exactly [1, 3 * frame_size].
  if (ctx->HasInput("Bias")) {
    auto b_dims = ctx->GetInputDim("Bias");
    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
    PADDLE_ENFORCE_EQ(b_dims[0], 1,
                      "The first dimension of Input(Bias) should be 1.");
    PADDLE_ENFORCE_EQ(b_dims[1], frame_size * 3,
                      "The shape of Bias must be [1, frame_size * 3].");
  }

  framework::DDim out_dims({x_dims[0], frame_size});
  ctx->SetOutputDim("Hidden", out_dims);
  ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
  ctx->SetOutputDim("BatchedHidden", out_dims);
  ctx->SetOutputDim("BatchResetHiddenPrev", out_dims);
  ctx->ShareLoD("X", "Hidden");

  // XX takes the narrower of the input width and the FC output width.
  int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
  ctx->ShareLoD("X", "XX");
}
// Selects the kernel from the element type of input "X" and the device
// context the op is executed with.
framework::OpKernelType FusionGRUOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  const auto* input_x = ctx.Input<framework::LoDTensor>("X");
  const auto data_type = framework::ToDataType(input_x->type());
  return framework::OpKernelType(data_type, ctx.device_context());
}
// Declares the inputs, outputs and attributes of the fusion_gru operator.
//
// The shape descriptions are kept in sync with FusionGRUOp::InferShape:
// XX is (T x 3D) or (T x M), and BatchResetHiddenPrev is (T x D).
void FusionGRUOpMaker::Make() {
  AddInput("X",
           "(LoDTensor) the input is a LodTensor, which support "
           "variable-time length input sequence. The underlying tensor in "
           "this LoDTensor is a matrix with shape (T X M), where T is the "
           "total time steps in this mini-batch, M is the dim size of x.");
  AddInput("H0",
           "(Tensor, optional) The initial hidden state is an optional "
           "input. This is a tensor with shape (N x D), where N is the "
           "batch size, D is the hidden size.")
      .AsDispensable();
  AddInput("WeightX",
           "(Tensor) The FC weight with shape (M x 3D), "
           "where M is the dim size of x, D is the hidden size. ");
  AddInput("WeightH",
           "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. ");
  AddInput("Bias",
           "(Tensor, optional) (1 x 3D). "
           "Almost same as GRUOp. "
           "Note: if have FC bias it should be added on this bias.")
      .AsDispensable();
  AddOutput("XX",
            "(LoDTensor) the result after X * WeightX (size is T x 3D)"
            " or batched_X (size is T x M), this will be automatically chosen,"
            " where T is the total time steps in this mini-batch,"
            " D is the hidden size, M is the dim size of x input.")
      .AsIntermediate();
  AddOutput("BatchedGate", "(LoDTensor) Same as GRUOp").AsIntermediate();
  AddOutput("BatchResetHiddenPrev", "(LoDTensor) (T x D) Same as GRUOp.")
      .AsIntermediate();
  AddOutput("BatchedHidden", "(LoDTensor) (T X D) Same as GRUOp.")
      .AsIntermediate();
  AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp");
  AddAttr<std::string>("activation",
                       "(string, default tanh) "
                       "The activation type used for output candidate {h}_t.")
      .SetDefault("tanh");
  AddAttr<std::string>(
      "gate_activation",
      "(string, default sigmoid) "
      "The activation type used in update gate and reset gate.")
      .SetDefault("sigmoid");
  AddAttr<bool>("is_reverse",
                "(bool, default: False) "
                "whether to compute reversed GRU.")
      .SetDefault(false);
  AddComment(R"DOC(
The Fusion complete GRU Operator.
This operator fuse the fully-connected operator into GRU,
more details can refer to GRU op.
)DOC");
}
// Allocates `dst` with the same dims as `src` on the place of `ctx`, then
// copies rows between `src` and `dst` according to `index_lod` using
// math::CopyMatrixRowsFunctor. `indexed_src` is forwarded unchanged to the
// functor.
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
                             const framework::Tensor& src,
                             framework::Vector<size_t> index_lod,
                             framework::Tensor* dst, bool indexed_src) {
  math::CopyMatrixRowsFunctor<DeviceContext, T> copy_rows;
  const auto& place = ctx.GetPlace();
  dst->mutable_data<T>(src.dims(), place);
  copy_rows(ctx, src, index_lod, dst, indexed_src);
}
// CPU kernel of the fusion_gru operator: a fully-connected projection of X
// followed by a GRU over the batch-reordered sequence.
//
// Steps performed by Compute():
//   1. Allocate all outputs and run the FC (X * WeightX [+ Bias]).
//   2. Reorder the data into batch layout with LoDTensor2BatchFunctor.
//   3. Iterate over the batch steps given by level 0 of BatchedGate's LoD,
//      updating the hidden state step by step.
//   4. Convert BatchedHidden back to sequence layout into Hidden.
template <typename DeviceContext, typename T>
class FusionGRUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<LoDTensor>("X");
    auto* wx = ctx.Input<Tensor>("WeightX");
    auto* wh = ctx.Input<Tensor>("WeightH");
    // Bias and H0 are tested against null before use below.
    auto* bias = ctx.Input<Tensor>("Bias");
    auto* h0 = ctx.Input<Tensor>("H0");

    auto* xx = ctx.Output<LoDTensor>("XX");
    auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
    auto* batch_reset_hidden_prev =
        ctx.Output<LoDTensor>("BatchResetHiddenPrev");
    auto* batch_hidden = ctx.Output<LoDTensor>("BatchedHidden");
    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
    bool is_reverse = ctx.Attr<bool>("is_reverse");

    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
    T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
    batch_reset_hidden_prev->mutable_data<T>(ctx.GetPlace());
    batch_hidden->mutable_data<T>(ctx.GetPlace());
    hidden_out->mutable_data<T>(ctx.GetPlace());

    const T* x_data = x->data<T>();
    const T* wx_data = wx->data<T>();
    const T* wh_data = wh->data<T>();
    auto x_dims = x->dims();
    auto wx_dims = wx->dims();
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
    // The order of "FC" and "reorder to batch" depends on which matrix is
    // narrower, so that XX always holds the narrower intermediate.
    // NOTE(review): presumably done to reorder less data -- confirm.
    if (x_dims[1] > wx_dims[1]) {
      // X is wider than the FC output: project first (into XX), then reorder
      // the projected result into BatchedGate.
      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
                                        x_data, wx_data, xx_data,
                                        bias ? bias->data<T>() : NULL);
      to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
    } else {
      // Otherwise reorder X into XX first, then project XX into BatchedGate.
      to_batch(dev_ctx, *x, xx, true, is_reverse);
      // BatchedGate was not produced by to_batch here, so it takes XX's LoD.
      batched_gate->set_lod(xx->lod());
      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
                                        xx_data, wx_data, batched_gate_data,
                                        bias ? bias->data<T>() : NULL);
    }

    int frame_size = static_cast<int>(wx_dims[1] / 3);
    math::GRUMetaValue<T> gru_value;
    // WeightH is split in two: the first 2 * frame_size * frame_size elements
    // are used as the gate weight, the remainder as the state weight.
    gru_value.gate_weight = const_cast<T*>(wh_data);
    gru_value.state_weight =
        const_cast<T*>(wh_data + 2 * frame_size * frame_size);

    Tensor ordered_h0;
    // Level 2 of BatchedGate's LoD is used as the row order for H0.
    // NOTE(review): presumably the sequence order produced by to_batch --
    // confirm against LoDTensor2BatchFunctor.
    framework::Vector<size_t> order(batched_gate->lod()[2]);
    if (h0) {
      ReorderInitState<DeviceContext, T>(
          ctx.template device_context<DeviceContext>(), *h0, order, &ordered_h0,
          true);
      gru_value.prev_out_value = ordered_h0.data<T>();
    } else {
      // No initial state: the first step runs without a previous hidden value.
      gru_value.prev_out_value = nullptr;
    }
    // Level 0 of the LoD holds the row offsets of each batch step.
    auto batch_starts = batched_gate->lod()[0];
    size_t seq_len = batch_starts.size() - 1;
    auto active_node = math::detail::GetActivationType(
        ctx.Attr<std::string>("activation"));
    auto active_gate = math::detail::GetActivationType(
        ctx.Attr<std::string>("gate_activation"));

#ifdef PADDLE_WITH_MKLML
    // use MKL packed to speedup GEMM
    // The packed path is only taken when FLAGS_paddle_num_threads >= 4;
    // otherwise the generic loop in the else branch is used.
    if (FLAGS_paddle_num_threads >= 4) {
      auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
      // Pack the gate weight once; it is reused by every step below.
      T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
                                       frame_size * 2 /*width of weight*/,
                                       frame_size /*height of height*/);
      PADDLE_ENFORCE(packed_gate);
      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
                     frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
                     packed_gate);
      // Pack the state weight once as well.
      T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
                                        frame_size /*width of weight*/,
                                        frame_size /*height of height*/);
      PADDLE_ENFORCE(packed_state);
      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
                     frame_size, T(1.0), gru_value.state_weight, frame_size,
                     packed_state);
      for (size_t n = 0; n < seq_len; n++) {
        int bstart = static_cast<int>(batch_starts[n]);
        int bend = static_cast<int>(batch_starts[n + 1]);
        int cur_batch_size = bend - bstart;

        // Views over the rows that belong to the current batch step.
        Tensor gate_t = batched_gate->Slice(bstart, bend);
        Tensor reset_hidden_prev_t =
            batch_reset_hidden_prev->Slice(bstart, bend);
        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
        gru_value.output_value = hidden_t.data<T>();
        gru_value.gate_value = gate_t.data<T>();
        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();

        // Accumulate prev_out * gate_weight into the first 2 * frame_size
        // columns of the gate buffer (row stride is 3 * frame_size).
        if (gru_value.prev_out_value) {
          blas.GEMM_COMPUTE(
              CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2,
              frame_size, gru_value.prev_out_value, frame_size, packed_gate,
              frame_size * 2, T(1), gru_value.gate_value, frame_size * 3);
        }

        math::detail::forward_reset_output(
            math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
            cur_batch_size, active_gate);

        // Accumulate reset_output * state_weight into the last frame_size
        // columns of the gate buffer.
        if (gru_value.prev_out_value) {
          blas.GEMM_COMPUTE(
              CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
              gru_value.reset_output_value, frame_size, packed_state,
              frame_size, T(1), gru_value.gate_value + frame_size * 2,
              frame_size * 3);
        }

        math::detail::forward_final_output(
            math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
            cur_batch_size, active_node);

        // The output of this step is the previous hidden state of the next.
        gru_value.prev_out_value = gru_value.output_value;
      }

      blas.GEMM_FREE(packed_gate);
      blas.GEMM_FREE(packed_state);
    } else {
#endif
      // Generic path: one GRUUnitFunctor call per batch step.
      for (size_t n = 0; n < seq_len; n++) {
        int bstart = static_cast<int>(batch_starts[n]);
        int bend = static_cast<int>(batch_starts[n + 1]);
        int cur_batch_size = bend - bstart;

        Tensor gate_t = batched_gate->Slice(bstart, bend);
        Tensor reset_hidden_prev_t =
            batch_reset_hidden_prev->Slice(bstart, bend);
        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
        gru_value.output_value = hidden_t.data<T>();
        gru_value.gate_value = gate_t.data<T>();
        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();

        math::GRUUnitFunctor<DeviceContext, T>::compute(
            dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
            active_gate);

        gru_value.prev_out_value = gru_value.output_value;
      }
#ifdef PADDLE_WITH_MKLML
    }
#endif
    // Restore sequence layout: BatchedHidden needs the batch LoD before it
    // can be converted back into Hidden.
    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
    batch_hidden->set_lod(batched_gate->lod());
    to_seq(dev_ctx, *batch_hidden, hidden_out);
  }
};
}
// namespace operators
}
// namespace paddle
// Register the fusion_gru operator together with its proto maker and the
// default grad-op-desc maker, then register its CPU kernels for float and
// double.
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(
    fusion_gru, ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, float>,
    ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, double>);
paddle/fluid/operators/fusion_gru_op.h
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
using
Tensor
=
framework
::
Tensor
;
// Operator class for fusion_gru. Both overrides are only declared here and
// are defined out of line.
class FusionGRUOp : public framework::OperatorWithKernel {
 public:
  // Reuse the base-class constructors.
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks inputs/outputs and sets the output shapes.
  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  // Chooses the kernel type used to run this op.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};
// Proto maker for fusion_gru; Make() is defined out of line and declares the
// operator's inputs, outputs, attributes and documentation.
class FusionGRUOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fusion_lstm_op.cc
浏览文件 @
b98b7440
...
@@ -15,10 +15,14 @@ limitations under the License. */
...
@@ -15,10 +15,14 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
DEFINE_bool
(
seq_mode
,
true
,
"Use sequence mode"
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -98,7 +102,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -98,7 +102,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
ctx
->
ShareLoD
(
"X"
,
"Hidden"
);
ctx
->
ShareLoD
(
"X"
,
"Hidden"
);
ctx
->
ShareLoD
(
"X"
,
"Cell"
);
ctx
->
ShareLoD
(
"X"
,
"Cell"
);
int
xx_width
=
x_dims
[
1
]
>
wx_dims
[
1
]
?
wx_dims
[
1
]
:
x_dims
[
1
];
int
xx_width
;
if
(
FLAGS_seq_mode
)
{
xx_width
=
wx_dims
[
1
];
}
else
{
xx_width
=
x_dims
[
1
]
>
wx_dims
[
1
]
?
wx_dims
[
1
]
:
x_dims
[
1
];
}
ctx
->
SetOutputDim
(
"XX"
,
{
x_dims
[
0
],
xx_width
});
ctx
->
SetOutputDim
(
"XX"
,
{
x_dims
[
0
],
xx_width
});
ctx
->
ShareLoD
(
"X"
,
"XX"
);
ctx
->
ShareLoD
(
"X"
,
"XX"
);
}
}
...
@@ -205,10 +214,138 @@ inline void ReorderInitState(const DeviceContext& ctx,
...
@@ -205,10 +214,138 @@ inline void ReorderInitState(const DeviceContext& ctx,
row_shuffle
(
ctx
,
src
,
index_lod
,
dst
,
indexed_src
);
row_shuffle
(
ctx
,
src
,
index_lod
,
dst
,
indexed_src
);
}
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
T
>
class
FuisonLSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
class
FuisonLSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
// Sequence-mode LSTM: processes the sequences of X one after another, one
// time step at a time, directly on the un-reordered data.
//
// XX first receives the FC result (X * WeightX + Bias) and is then used
// in place as per-step scratch space. For every step the four blocks of
// width D in xx_data are laid out as [candidate, input, forget, output]
// (see the "W_ch, W_ih, W_fh, W_oh" comments below).
//
// When is_reverse is set, the data pointers start at the last time step and
// move backwards, and sequences are visited from last to first.
void SeqCompute(const framework::ExecutionContext& ctx) const {
  using DeviceContext = paddle::platform::CPUDeviceContext;
  auto* x = ctx.Input<LoDTensor>("X");
  // H0 and C0 may be null; see h0_data / c0_data below.
  auto* h0 = ctx.Input<Tensor>("H0");
  auto* c0 = ctx.Input<Tensor>("C0");
  auto* wx = ctx.Input<Tensor>("WeightX");
  auto* wh = ctx.Input<Tensor>("WeightH");
  auto* bias = ctx.Input<Tensor>("Bias");

  auto* xx = ctx.Output<LoDTensor>("XX");
  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
  auto* cell_out = ctx.Output<LoDTensor>("Cell");
  bool is_reverse = ctx.Attr<bool>("is_reverse");

  // Activation callbacks taking (length, input, output).
  std::function<void(const int, const T*, T*)> act_gate, act_cell, act_cand;
  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
  auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
  auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
  // Pick the AVX implementations when the CPU reports AVX support, otherwise
  // fall back to the generic ones.
  if (platform::jit::MayIUse(platform::jit::avx)) {
    math::VecActivations<T, platform::jit::avx> act_functor;
    act_gate = act_functor(act_gate_str);
    act_cell = act_functor(act_cell_str);
    act_cand = act_functor(act_cand_str);
  } else {
    math::VecActivations<T, platform::jit::isa_any> act_functor;
    act_gate = act_functor(act_gate_str);
    act_cell = act_functor(act_cell_str);
    act_cand = act_functor(act_cand_str);
  }

  auto x_lod = x->lod();
  auto x_dims = x->dims();    // T x M
  auto wh_dims = wh->dims();  // D x 4D
  const int total_T = x_dims[0];
  const int N = x_lod[0].size() - 1;  // batch size
  const int M = x_dims[1];            // x frame size
  const int D = wh_dims[0];
  const int D2 = D * 2;
  const int D3 = D * 3;
  const int D4 = wh_dims[1];

  const T* x_data = x->data<T>();
  const T* h0_data = h0 ? h0->data<T>() : NULL;
  const T* c0_data = c0 ? c0->data<T>() : NULL;
  const T* wx_data = wx->data<T>();
  const T* wh_data = wh->data<T>();
  T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
  T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
  T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());

  auto blas = math::GetBlas<DeviceContext, T>(ctx);
  // FC for all time steps at once. Bias is dereferenced unconditionally.
  // NOTE(review): assumes Bias is a required input of this op -- confirm.
  math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
                                    xx_data, bias->data<T>());
  // Per-step pointer increments (negated when running in reverse).
  int xx_offset = D4;
  int gate_offset = D;
  if (is_reverse) {
    // Start from the last time step of the whole mini-batch.
    const int offset = (total_T - 1) * D;
    xx_data = xx_data + offset * 4;
    hidden_out_data = hidden_out_data + offset;
    cell_out_data = cell_out_data + offset;
    xx_offset = -D4;
    gate_offset = -D;
  }

  // Advances all three data pointers by one time step.
  auto move_step = [&]() {
    xx_data = xx_data + xx_offset;
    hidden_out_data = hidden_out_data + gate_offset;
    cell_out_data = cell_out_data + gate_offset;
  };

  for (int i = 0; i < N; ++i) {
    int bid = is_reverse ? N - 1 - i : i;
    int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
    const T* prev_cell_data = NULL;
    const T* prev_hidden_data = NULL;
    int tstart = 0;
    if (h0_data) {
      // Initial states are given: use row `bid` of H0/C0 for this sequence.
      // NOTE(review): c0_data is used without its own null check; assumes C0
      // is always supplied together with H0 -- confirm against InferShape.
      prev_hidden_data = h0_data + bid * D;
      prev_cell_data = c0_data + bid * D;
    } else {
      // No initial state: compute the first step without the recurrent GEMM
      // and without the forget-gate term.
      // W_ch, W_ih, W_fh, W_oh
      act_gate(D3, xx_data + D, xx_data + D);
      act_cand(D, xx_data, xx_data);
      // cell out= input*tilde
      blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
      // hidden out= act_state(cellout) * outgate
      act_cell(D, cell_out_data, xx_data + D2);
      blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);

      // prev
      prev_hidden_data = hidden_out_data;
      prev_cell_data = cell_out_data;
      tstart = 1;
      move_step();
    }
    for (int step = tstart; step < seq_len; ++step) {
      // xx += prev_hidden * WeightH  (1 x D times D x 4D, accumulated).
      blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
                prev_hidden_data, D, wh_data, D4, static_cast<T>(1), xx_data,
                D4);

      // W_ch, W_ih, W_fh, W_oh
      act_gate(D3, xx_data + D, xx_data + D);
      act_cand(D, xx_data, xx_data);

      // a = forget * prev_cell
      blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2);

      // b = input * tilde
      blas.VMUL(D, xx_data, xx_data + D, xx_data + D);

      // cell out= a+b
      blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);

      // hidden out= act_state(cellout) * outgate
      act_cell(D, cell_out_data, xx_data + D2);
      blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);

      // prev
      prev_hidden_data = hidden_out_data;
      prev_cell_data = cell_out_data;

      move_step();
    }
  }
}
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
platform
::
CPUDeviceContext
;
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
wx
=
ctx
.
Input
<
Tensor
>
(
"WeightX"
);
auto
*
wx
=
ctx
.
Input
<
Tensor
>
(
"WeightX"
);
auto
*
wh
=
ctx
.
Input
<
Tensor
>
(
"WeightH"
);
auto
*
wh
=
ctx
.
Input
<
Tensor
>
(
"WeightH"
);
...
@@ -339,6 +476,13 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -339,6 +476,13 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
// restore the output cell state in LoDTensor from the batch cell
// restore the output cell state in LoDTensor from the batch cell
to_seq
(
dev_ctx
,
batch_cell
,
cell_out
);
to_seq
(
dev_ctx
,
batch_cell
,
cell_out
);
}
}
// Kernel entry point: dispatches to the sequence-mode implementation when
// FLAGS_seq_mode is set and to the batch-mode implementation otherwise.
void Compute(const framework::ExecutionContext& ctx) const override {
  if (!FLAGS_seq_mode) {
    BatchCompute(ctx);
    return;
  }
  SeqCompute(ctx);
}
};
};
}
// namespace operators
}
// namespace operators
...
@@ -348,7 +492,5 @@ namespace ops = paddle::operators;
...
@@ -348,7 +492,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR
(
fusion_lstm
,
ops
::
FusionLSTMOp
,
ops
::
FusionLSTMOpMaker
,
REGISTER_OPERATOR
(
fusion_lstm
,
ops
::
FusionLSTMOp
,
ops
::
FusionLSTMOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OP_CPU_KERNEL
(
REGISTER_OP_CPU_KERNEL
(
fusion_lstm
,
ops
::
FuisonLSTMKernel
<
float
>
,
fusion_lstm
,
ops
::
FuisonLSTMKernel
<
double
>
);
ops
::
FuisonLSTMKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
FuisonLSTMKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
operators
{
// Shape inference for fusion_seqexpand_concat_fc.
//
// Requires more than one "X" input, a rank-2 FCWeight whose height equals the
// summed widths of all X inputs, and (optionally) an FCBias of shape [D] or
// [1, D], where D is the width of FCWeight. Out is set to
// (rows of the first X) x D and shares the LoD of the first X.
void FusionSeqExpandConcatFCOp::InferShape(
    framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_GT(
      ctx->Inputs("X").size(), 1UL,
      "Inputs(X) of FusionSeqExpandConcatFCOp should larger than 1.");
  PADDLE_ENFORCE(
      ctx->HasInput("FCWeight"),
      "Input(FCWeight) of FusionSeqExpandConcatFCOp should not be null.");
  PADDLE_ENFORCE(
      ctx->HasOutput("Out"),
      "Output(Out) of FusionSeqExpandConcatFCOp should not be null.");
  PADDLE_ENFORCE(
      ctx->HasOutput("FCOut"),
      "Output(FCOut) of FusionSeqExpandConcatFCOp should not be null.");

  auto ins_dims = ctx->GetInputsDim("X");
  // FCWeight is (M0 + M1 + M2 + ..) x D.
  auto w_dims = ctx->GetInputDim("FCWeight");
  PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2.");
  const int D = w_dims[1];

  // Total width of all inputs; must match the height of the FC weight.
  int sum = 0;
  for (size_t idx = 0; idx < ins_dims.size(); ++idx) {
    sum += ins_dims[idx][1];
  }
  PADDLE_ENFORCE_EQ(sum, w_dims[0],
                    "FC height should be sum of all inputs width.");

  if (ctx->HasInput("FCBias")) {
    auto b_dims = ctx->GetInputDim("FCBias");
    PADDLE_ENFORCE(b_dims.size() == 1 || b_dims.size() == 2,
                   "b_dims should be 1 or 2, get %d", b_dims.size());
    if (b_dims.size() == 2) {
      PADDLE_ENFORCE_EQ(b_dims[0], 1, "FCBias shapes must be 1x%d.", D);
      PADDLE_ENFORCE_EQ(b_dims[1], D, "FCBias shapes must be 1x%d.", D);
    } else {
      PADDLE_ENFORCE_EQ(b_dims[0], D, "FCBias shapes must be %d.", D);
    }
  }

  ctx->SetOutputDim("Out", {ins_dims[0][0], D});
  // FCOut is resized at run time because the LoD is not available here.
  // Explicitly share the LoD of the first (reference) input with Out.
  ctx->ShareLoD("X", "Out", 0);
}
// Selects the kernel from the element type of the first "X" input and the
// device context the op is executed with.
framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  const auto inputs = ctx.MultiInput<LoDTensor>("X");
  const auto data_type = framework::ToDataType(inputs[0]->type());
  return framework::OpKernelType(data_type, ctx.device_context());
}
void
FusionSeqExpandConcatFCOpMaker
::
Make
()
{
AddInput
(
"X"
,
"(LoDTensor) input LodDTensors, the first one must be have ref lod "
"for sequence expand, and the rest input should have same lod."
)
.
AsDuplicable
();
AddInput
(
"FCWeight"
,
"(Tensor) the weights of fc."
);
AddInput
(
"FCBias"
,
"(Tensor, optional) the bias of fc."
).
AsDispensable
();
AddOutput
(
"Out"
,
"(LoDTensor) Output LodTensor."
);
AddOutput
(
"FCOut"
,
"(Tensor) the intermediate tensor to keep the result of fc."
"Shape is (N x D), where N is the batch size, D is the output dim of fc"
)
.
AsIntermediate
();
AddAttr
<
std
::
string
>
(
"fc_activation"
,
"(string, default: identity)"
"The activation for the result of fc."
"`identity` by default."
)
.
SetDefault
(
"identity"
)
.
InEnum
({
"sigmoid"
,
"tanh"
,
"relu"
,
"identity"
});
AddComment
(
R"DOC(
Fusion Sequence expand + concat + fc Operator.
All below conditions should be meet:
The ref_level of seq_expand should be 0.
The ref lod of seq_expand level is the first input of concat.
The other inputs should have same lod and same batch size of ref lod.
The seq len of other inputs should be 1.
The concat axis should be 1.
)DOC"
);
}
// CPU kernel for the fused "sequence_expand + concat + fc" operator.
//
// Inputs read from the execution context:
//   X        - list of LoDTensors; X[0] is the reference sequence input,
//              X[1..] are the per-sequence inputs that get expanded.
//   FCWeight - fc weight; its rows are consumed input by input
//              (M0 rows for X[0], then dims()[1] rows for each further X[i]).
//   FCBias   - optional fc bias (may be null).
// Outputs:
//   Out      - result buffer, processed as total_T x D elements.
//   FCOut    - intermediate buffer, resized here to N x D.
template <typename T>
class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    using DeviceContext = paddle::platform::CPUDeviceContext;
    auto ins = ctx.MultiInput<LoDTensor>("X");
    auto* w = ctx.Input<Tensor>("FCWeight");
    auto* b = ctx.Input<Tensor>("FCBias");
    auto* out = ctx.Output<LoDTensor>("Out");
    auto* fc_out = ctx.Output<Tensor>("FCOut");

    // ins[0] and ins[1] are dereferenced unconditionally below, so make sure
    // they exist before touching them.
    PADDLE_ENFORCE_GT(ins.size(), 1UL,
                      "Input(X) should contain at least two tensors.");

    auto* ref_in = ins[0];
    auto ref_lod = ref_in->lod();
    auto in1_lod = ins[1]->lod();
    auto ref_dims = ref_in->dims();  // T x M0
    auto in1_dims = ins[1]->dims();  // N x M1
    auto w_dims = w->dims();

    // These checks (and the reshape of FCOut) are done here because
    // InferShape can not get lod info.
    // The lod sizes must be validated BEFORE ref_lod[0] / in1_lod[0] are
    // indexed, otherwise an empty lod would be an out-of-bounds access.
    PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1.");
    PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1.");

    const int N = ref_lod[0].size() - 1;  // number of sequences (batch size)
    const int total_T = ref_dims[0];      // total number of time steps
    const int M0 = ref_dims[1];
    const int M1 = in1_dims[1];
    const int D = w_dims[1];              // fc output width

    PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N,
                      "Batch size of all inputs should be equal.");
    PADDLE_ENFORCE_EQ(in1_lod[0][N], N,
                      "Seq_length of other inputs should be 1.");
    PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size.");
    for (size_t i = 2; i < ins.size(); ++i) {
      PADDLE_ENFORCE_EQ(ins[i]->dims()[0], N,
                        "All other inputs height should be equal");
      PADDLE_ENFORCE_EQ(ins[i]->lod(), in1_lod,
                        "All other inputs should have same lod");
    }
    fc_out->Resize({N, D});

    // Select the vectorized activation implementation for the current CPU.
    std::function<void(const int, const T*, T*)> fc_act;
    auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
    if (platform::jit::MayIUse(platform::jit::avx)) {
      math::VecActivations<T, platform::jit::avx> act_functor;
      fc_act = act_functor(fc_act_str);
    } else {
      math::VecActivations<T, platform::jit::isa_any> act_functor;
      fc_act = act_functor(fc_act_str);
    }

    const T* ref_in_data = ref_in->data<T>();
    const T* in1_data = ins[1]->data<T>();
    const T* w_data = w->data<T>();
    T* out_data = out->mutable_data<T>(ctx.GetPlace());
    T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());

    auto blas = math::GetBlas<DeviceContext, T>(ctx);
    // Reference input times the first M0 rows of the weight goes straight
    // into Out; the optional bias is handed to FCCompute as well.
    math::FCCompute<DeviceContext, T>(blas, total_T, D, M0, ref_in_data, w_data,
                                      out_data, b ? b->data<T>() : nullptr);
    w_data = w_data + M0 * D;
    // First write on FCOut: X[1] times the next M1 rows of the weight.
    blas.MatMul(N, D, M1, in1_data, w_data, fc_out_data);
    w_data = w_data + M1 * D;
    for (size_t i = 2; i < ins.size(); ++i) {
      // Add on: accumulate X[i] times its slice of the weight (beta == 1).
      const T* in_data = ins[i]->data<T>();
      const int K = ins[i]->dims()[1];
      blas.GEMM(CblasNoTrans, CblasNoTrans, N, D, K, static_cast<T>(1), in_data,
                K, w_data, D, static_cast<T>(1), fc_out_data, D);
      w_data = w_data + K * D;
    }

    // "Sequence expand": row i of FCOut is added to every time step that
    // belongs to sequence i of the reference input.
    T* cur_out_data = out_data;
    for (int i = 0; i < N; ++i) {
      int seq_len = ref_lod[0][i + 1] - ref_lod[0][i];
      T* src = fc_out_data + i * D;
      for (int step = 0; step < seq_len; ++step) {
        blas.VADD(D, cur_out_data, src, cur_out_data);
        cur_out_data = cur_out_data + D;
      }
    }
    // Apply the activation in place over the whole output.
    fc_act(total_T * D, out_data, out_data);
  }
};
}
// namespace operators
}
// namespace paddle
// Short alias for paddle::operators, used by the registrations below.
namespace
ops
=
paddle
::
operators
;
// Register the operator under the name `fusion_seqexpand_concat_fc`, together
// with its proto/checker maker and the default grad-op-desc maker.
REGISTER_OPERATOR
(
fusion_seqexpand_concat_fc
,
ops
::
FusionSeqExpandConcatFCOp
,
ops
::
FusionSeqExpandConcatFCOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
// Register the CPU kernel for the float and double element types.
REGISTER_OP_CPU_KERNEL
(
fusion_seqexpand_concat_fc
,
ops
::
FusionSeqExpandConcatFCOpKernel
<
float
>
,
ops
::
FusionSeqExpandConcatFCOpKernel
<
double
>
);
paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
using
Tensor
=
framework
::
Tensor
;
// Operator class of the fused "sequence_expand + concat + fc" op.
// Only declarations live here; the definitions are provided elsewhere.
class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel {
 public:
  // Reuse the constructors of the base class.
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Shape inference hook of the operator.
  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  // Selects the kernel type used to run this operator.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};
// Proto/checker maker of the fused "sequence_expand + concat + fc" op.
// Make() is only declared here; its definition is provided elsewhere.
class FusionSeqExpandConcatFCOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/concat.cu
浏览文件 @
b98b7440
...
@@ -177,6 +177,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
...
@@ -177,6 +177,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
dev_ins_data
,
dev_ins_col_data
,
static_cast
<
int
>
(
inputs_col
.
size
()),
dev_ins_data
,
dev_ins_col_data
,
static_cast
<
int
>
(
inputs_col
.
size
()),
out_row
,
out_col
,
output
->
data
<
T
>
());
out_row
,
out_col
,
output
->
data
<
T
>
());
}
}
// Wait() must be called because `inputs_data` may be destructed before
// kernel ends
context
.
Wait
();
}
}
};
};
...
@@ -252,6 +255,9 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
...
@@ -252,6 +255,9 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
input
.
data
<
T
>
(),
in_row
,
in_col
,
dev_outs_col_data
,
input
.
data
<
T
>
(),
in_row
,
in_col
,
dev_outs_col_data
,
static_cast
<
int
>
(
outputs_cols
.
size
()),
dev_out_gpu_data
);
static_cast
<
int
>
(
outputs_cols
.
size
()),
dev_out_gpu_data
);
}
}
// Wait() must be called because `outputs_data` may be destructed before
// kernel ends
context
.
Wait
();
}
}
};
};
...
...
paddle/fluid/operators/math/cpu_vec.h
浏览文件 @
b98b7440
...
@@ -14,6 +14,7 @@ limitations under the License. */
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#pragma once
#include <cmath>
#include <cmath>
#include <functional>
#include <string>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef __AVX__
#ifdef __AVX__
...
...
paddle/fluid/operators/math/math_function.cc
浏览文件 @
b98b7440
...
@@ -41,7 +41,8 @@ template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
...
@@ -41,7 +41,8 @@ template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;
template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;
DEFINE_CPU_TRANS
(
1
);
DEFINE_CPU_TRANS
(
1
);
DEFINE_CPU_TRANS
(
2
);
DEFINE_CPU_TRANS
(
2
);
...
...
paddle/fluid/operators/math/math_function.cu
浏览文件 @
b98b7440
...
@@ -36,7 +36,8 @@ template struct SetConstant<platform::CUDADeviceContext, bool>;
...
@@ -36,7 +36,8 @@ template struct SetConstant<platform::CUDADeviceContext, bool>;
#define DEFINE_GPU_TRANS(RANK) \
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<platform::CUDADeviceContext, float, RANK>; \
template struct Transpose<platform::CUDADeviceContext, float, RANK>; \
template struct Transpose<platform::CUDADeviceContext, double, RANK>; \
template struct Transpose<platform::CUDADeviceContext, double, RANK>; \
template struct Transpose<platform::CUDADeviceContext, float16, RANK>;
template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \
template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;
DEFINE_GPU_TRANS
(
1
);
DEFINE_GPU_TRANS
(
1
);
DEFINE_GPU_TRANS
(
2
);
DEFINE_GPU_TRANS
(
2
);
...
...
paddle/fluid/operators/math/padding.h
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
// Local shorthand for framework::EigenTensor with the same template
// parameters; row-major layout and Eigen::DenseIndex are the defaults.
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
// Pads the rank-D tensor `src` into `*out`, filling new cells with
// `pad_value`.
//
// `pads` is read as a flat list of (before, after) pairs, one pair per
// dimension: pads[2 * d] and pads[2 * d + 1] belong to dimension d.
// The expression is evaluated on the Eigen device of `DeviceContext`.
template <typename DeviceContext, typename T, size_t D>
void PadFunction(const framework::ExecutionContext& context,
                 const std::vector<int>& pads, const framework::Tensor& src,
                 T pad_value, framework::Tensor* out) {
  // Translate the flat pad list into Eigen's per-dimension pair array.
  Eigen::array<std::pair<int, int>, D> pad_pairs;
  for (size_t dim = 0; dim < pad_pairs.size(); ++dim) {
    const size_t base = dim * 2;
    pad_pairs[dim].first = pads[base];
    pad_pairs[dim].second = pads[base + 1];
  }

  auto input_view = EigenTensor<T, D>::From(src);
  auto output_view = EigenTensor<T, D>::From(*out);

  auto& eigen_dev =
      *context.template device_context<DeviceContext>().eigen_device();
  output_view.device(eigen_dev) = input_view.pad(pad_pairs, pad_value);
}
// Counterpart of PadFunction for the backward direction: applies the NEGATED
// pads to the rank-D tensor `src` and writes the result into `*d_out`.
//
// `pads` uses the same flat (before, after)-per-dimension layout as in
// PadFunction. The fill value handed to Eigen is 0.
template <typename DeviceContext, typename T, size_t D>
void PadGradFunction(const framework::ExecutionContext& context,
                     const std::vector<int>& pads, const framework::Tensor& src,
                     framework::Tensor* d_out) {
  // Negative paddings: each (before, after) pair is sign-flipped.
  Eigen::array<std::pair<int, int>, D> neg_pad_pairs;
  for (size_t dim = 0; dim < neg_pad_pairs.size(); ++dim) {
    const size_t base = dim * 2;
    neg_pad_pairs[dim].first = -pads[base];
    neg_pad_pairs[dim].second = -pads[base + 1];
  }

  auto grad_out_view = EigenTensor<T, D>::From(*d_out);
  auto input_view = EigenTensor<T, D>::From(src);

  auto& eigen_dev =
      *context.template device_context<DeviceContext>().eigen_device();
  grad_out_view.device(eigen_dev) = input_view.pad(neg_pad_pairs, 0);
}
// Runtime-rank dispatcher for PadFunction.
//
// Forwards to PadFunction<DeviceContext, T, rank> for rank in [1, 6];
// any other rank raises via PADDLE_THROW.
template <typename DeviceContext, typename T>
void PaddingFunctor(int rank, const framework::ExecutionContext& context,
                    const std::vector<int>& pads, T pad_value,
                    const framework::Tensor& src, framework::Tensor* out) {
  if (rank == 1) {
    PadFunction<DeviceContext, T, 1>(context, pads, src, pad_value, out);
  } else if (rank == 2) {
    PadFunction<DeviceContext, T, 2>(context, pads, src, pad_value, out);
  } else if (rank == 3) {
    PadFunction<DeviceContext, T, 3>(context, pads, src, pad_value, out);
  } else if (rank == 4) {
    PadFunction<DeviceContext, T, 4>(context, pads, src, pad_value, out);
  } else if (rank == 5) {
    PadFunction<DeviceContext, T, 5>(context, pads, src, pad_value, out);
  } else if (rank == 6) {
    PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
  } else {
    PADDLE_THROW("PadOp only support tensors with no more than 6 dimensions.");
  }
}
// Runtime-rank dispatcher for PadGradFunction.
//
// Forwards to PadGradFunction<DeviceContext, T, rank> for rank in [1, 6];
// any other rank raises via PADDLE_THROW.
template <typename DeviceContext, typename T>
void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
                        const std::vector<int>& pads,
                        const framework::Tensor& src, framework::Tensor* out) {
  if (rank == 1) {
    PadGradFunction<DeviceContext, T, 1>(context, pads, src, out);
  } else if (rank == 2) {
    PadGradFunction<DeviceContext, T, 2>(context, pads, src, out);
  } else if (rank == 3) {
    PadGradFunction<DeviceContext, T, 3>(context, pads, src, out);
  } else if (rank == 4) {
    PadGradFunction<DeviceContext, T, 4>(context, pads, src, out);
  } else if (rank == 5) {
    PadGradFunction<DeviceContext, T, 5>(context, pads, src, out);
  } else if (rank == 6) {
    PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
  } else {
    PADDLE_THROW("PadOp only support tensors with no more than 6 dimensions.");
  }
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/sequence2batch.cc
浏览文件 @
b98b7440
...
@@ -38,13 +38,14 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
...
@@ -38,13 +38,14 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
auto
width
=
dst_dims
[
1
];
auto
width
=
dst_dims
[
1
];
auto
*
src_data
=
src
.
data
<
T
>
();
auto
*
src_data
=
src
.
data
<
T
>
();
auto
*
dst_data
=
dst
->
data
<
T
>
();
auto
*
dst_data
=
dst
->
data
<
T
>
();
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
const
int
sz
=
width
*
sizeof
(
T
);
if
(
is_src_index
)
{
if
(
is_src_index
)
{
memcpy
(
dst_data
+
i
*
width
,
src_data
+
index
[
i
]
*
width
,
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
width
*
sizeof
(
T
));
memcpy
(
dst_data
+
i
*
width
,
src_data
+
index
[
i
]
*
width
,
sz
);
}
}
else
{
}
else
{
memcpy
(
dst_data
+
index
[
i
]
*
width
,
src_data
+
i
*
width
,
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
width
*
sizeof
(
T
)
);
memcpy
(
dst_data
+
index
[
i
]
*
width
,
src_data
+
i
*
width
,
sz
);
}
}
}
}
}
}
...
...
paddle/fluid/operators/math/sequence_padding.cc
浏览文件 @
b98b7440
...
@@ -18,65 +18,86 @@ namespace paddle {
...
@@ -18,65 +18,86 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
// Copies the valid (non-padding) steps of every sequence between a packed
// sequence buffer and a padded buffer, one step (step_width elements) at a
// time.
//
//   type == kSeqToPad : read at the sequence offset, write at the pad offset;
//   otherwise         : read at the pad offset, write at the sequence offset.
//
//   layout == kBatchLengthWidth : the steps of sequence s start at
//       s * pad_seq_len * step_width and are step_width apart;
//   otherwise                   : they start at s * step_width and are
//       seq_num * step_width apart.
//
// When `norm_by_len` is set, each copied element is additionally multiplied
// by 1 / (length of its sequence).
template <typename T>
void CopyValidData(framework::Tensor* dst_tensor,
                   const framework::Tensor* src_tensor,
                   const framework::Vector<size_t>& seq_offsets,
                   int pad_seq_len, int step_width, bool norm_by_len,
                   CopyType type, PadLayout layout) {
  int num_seqs = seq_offsets.size() - 1;
  const T* src_base = src_tensor->data<T>();
  T* dst_base = dst_tensor->data<T>();

  const bool seq_to_pad = (type == kSeqToPad);
  const bool batch_major = (layout == kBatchLengthWidth);

  // Distance (in elements) between consecutive steps of one sequence on the
  // packed side and on the padded side, respectively.
  int seq_stride = step_width;
  int pad_stride = batch_major ? step_width : num_seqs * step_width;

  for (int s = 0; s < num_seqs; ++s) {
    int cur_len = seq_offsets[s + 1] - seq_offsets[s];
    PADDLE_ENFORCE_GE(
        pad_seq_len, cur_len,
        "The padded sequence length can not be less than its original length.");

    int seq_pos = seq_offsets[s] * step_width;
    int pad_pos;
    if (batch_major) {
      pad_pos = s * pad_seq_len * step_width;
    } else {
      pad_pos = s * step_width;
    }

    float scale = 1.0f / static_cast<float>(cur_len);
    for (int step = 0; step < cur_len; ++step) {
      const int read_pos = seq_to_pad ? seq_pos : pad_pos;
      const int write_pos = seq_to_pad ? pad_pos : seq_pos;
      const T* from = src_base + read_pos;
      T* to = dst_base + write_pos;

      memcpy(to, from, step_width * sizeof(T));
      if (norm_by_len) {
        for (int k = 0; k < step_width; ++k) {
          to[k] *= scale;
        }
      }

      seq_pos += seq_stride;
      pad_pos += pad_stride;
    }
  }
}
template
<
typename
T
>
template
<
typename
T
>
class
PaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
class
PaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
public:
public:
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
LoDTensor
&
seq
,
framework
::
Tensor
*
padding
,
const
framework
::
LoDTensor
&
seq_tensor
,
bool
norm_by_times
)
{
framework
::
LoDTensor
*
pad_tensor
,
auto
lod
=
seq
.
lod
();
const
framework
::
LoDTensor
&
pad_value
,
int
pad_seq_len
=
-
1
,
PADDLE_ENFORCE_GT
(
lod
.
size
(),
0UL
,
int
lod_level
=
0
,
bool
norm_by_times
=
false
,
"The LoD of LoDTensor seq should not be null."
);
const
PadLayout
layout
=
kBatchLengthWidth
)
{
auto
seq_lod
=
seq_tensor
.
lod
();
const
size_t
level
=
0
;
const
auto
seq_offsets
=
framework
::
ToAbsOffset
(
seq_lod
)[
lod_level
];
framework
::
LoD
abs_offset_lod
=
framework
::
ToAbsOffset
(
lod
);
const
auto
&
seq_tensor_dims
=
seq_tensor
.
dims
();
const
auto
&
pad_tensor_dims
=
pad_tensor
->
dims
();
auto
seq_dims
=
seq
.
dims
();
if
(
pad_seq_len
==
-
1
)
{
PADDLE_ENFORCE_EQ
(
seq_dims
[
0
],
pad_seq_len
=
MaximumSequenceLength
(
seq_offsets
);
static_cast
<
int64_t
>
(
abs_offset_lod
[
level
].
back
()),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length."
);
auto
padding_dims
=
padding
->
dims
();
PADDLE_ENFORCE_EQ
(
padding_dims
.
size
(),
3UL
,
"The input padding should be a 3-D Tensor of shape "
"[max_sequence_length, num_sequences, sequence_width]."
);
const
int64_t
max_sequence_length
=
MaximumSequenceLength
(
lod
,
level
);
PADDLE_ENFORCE_EQ
(
padding_dims
[
0
],
max_sequence_length
,
"The first dimension of Tensor padding should be the "
"maximum length of all sequences in LoDTensor seq."
);
const
int64_t
num_sequences
=
abs_offset_lod
[
level
].
size
()
-
1
;
PADDLE_ENFORCE_EQ
(
padding_dims
[
1
],
num_sequences
,
"The second dimension of Tensor padding should be the "
"number of sequences in LoDTensor seq."
);
const
int64_t
sequence_width
=
seq
.
numel
()
/
seq_dims
[
0
];
PADDLE_ENFORCE_EQ
(
padding_dims
[
2
],
sequence_width
,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq."
);
const
T
*
seq_data
=
seq
.
data
<
T
>
();
T
*
padding_data
=
padding
->
data
<
T
>
();
for
(
int64_t
i
=
0
;
i
<
max_sequence_length
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
num_sequences
;
++
j
)
{
int64_t
start_pos
=
abs_offset_lod
[
level
][
j
];
int64_t
sequence_length
=
abs_offset_lod
[
level
][
j
+
1
]
-
start_pos
;
if
(
i
<
sequence_length
)
{
// i > 0 => sequence_length > 0
T
scale
=
norm_by_times
?
(
1.0
f
/
static_cast
<
T
>
(
sequence_length
))
:
1.0
f
;
for
(
int64_t
k
=
0
;
k
<
sequence_width
;
++
k
)
{
padding_data
[(
i
*
num_sequences
+
j
)
*
sequence_width
+
k
]
=
seq_data
[(
start_pos
+
i
)
*
sequence_width
+
k
]
*
scale
;
}
}
}
else
{
int
step_width
=
seq_tensor
.
numel
()
/
seq_tensor_dims
[
0
];
memset
(
padding_data
+
(
i
*
num_sequences
+
j
)
*
sequence_width
,
0
,
sequence_width
*
sizeof
(
T
));
CheckDims
(
seq_tensor_dims
,
pad_tensor_dims
,
seq_offsets
,
pad_seq_len
,
step_width
,
layout
);
PADDLE_ENFORCE
(
pad_value
.
numel
()
==
1
||
pad_value
.
numel
()
==
step_width
,
"The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width'."
);
// fill padding value
T
*
pad_data
=
pad_tensor
->
data
<
T
>
();
const
T
*
pad_value_data
=
pad_value
.
data
<
T
>
();
if
(
pad_value
.
numel
()
==
1
)
{
for
(
int
i
=
0
;
i
<
pad_tensor
->
numel
();
++
i
)
{
pad_data
[
i
]
=
*
pad_value_data
;
}
}
}
else
{
for
(
int
i
=
0
;
i
<
pad_tensor
->
numel
();
i
+=
step_width
)
{
memcpy
(
pad_data
+
i
,
pad_value_data
,
step_width
*
sizeof
(
T
));
}
}
}
}
CopyValidData
<
T
>
(
pad_tensor
,
&
seq_tensor
,
seq_offsets
,
pad_seq_len
,
step_width
,
norm_by_times
,
kSeqToPad
,
layout
);
}
}
};
};
...
@@ -84,62 +105,35 @@ template <typename T>
...
@@ -84,62 +105,35 @@ template <typename T>
class
UnpaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
class
UnpaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
public:
public:
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
framework
::
LoDTensor
*
seq
,
const
framework
::
Tensor
&
padding
,
const
framework
::
LoDTensor
&
pad_tensor
,
bool
norm_by_times
)
{
framework
::
LoDTensor
*
seq_tensor
,
int
pad_seq_len
=
-
1
,
auto
lod
=
seq
->
lod
();
int
lod_level
=
0
,
bool
norm_by_times
=
false
,
PADDLE_ENFORCE_GT
(
lod
.
size
(),
0UL
,
const
PadLayout
layout
=
kBatchLengthWidth
)
{
"The LoD of LoDTensor seq should not be null."
);
auto
seq_offsets
=
framework
::
ToAbsOffset
(
seq_tensor
->
lod
())[
lod_level
];
const
auto
&
seq_tensor_dims
=
seq_tensor
->
dims
();
const
size_t
level
=
0
;
const
auto
&
pad_tensor_dims
=
pad_tensor
.
dims
();
framework
::
LoD
abs_offset_lod
=
framework
::
ToAbsOffset
(
lod
);
if
(
pad_seq_len
==
-
1
)
{
pad_seq_len
=
MaximumSequenceLength
(
seq_offsets
);
auto
seq_dims
=
seq
->
dims
();
PADDLE_ENFORCE_EQ
(
seq_dims
[
0
],
static_cast
<
int64_t
>
(
abs_offset_lod
[
level
].
back
()),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length."
);
auto
padding_dims
=
padding
.
dims
();
PADDLE_ENFORCE_EQ
(
padding_dims
.
size
(),
3UL
,
"The input padding should be a 3-D Tensor of shape "
"[max_sequnece_length, num_sequences, sequence_width]."
);
const
int64_t
max_sequence_length
=
MaximumSequenceLength
(
lod
,
level
);
PADDLE_ENFORCE_EQ
(
padding_dims
[
0
],
max_sequence_length
,
"The first dimension of Tensor padding should be "
"the maximum length of all sequences in LoDTensor seq."
);
const
int64_t
num_sequences
=
abs_offset_lod
[
level
].
size
()
-
1
;
PADDLE_ENFORCE_EQ
(
padding_dims
[
1
],
num_sequences
,
"The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq."
);
const
int64_t
sequence_width
=
seq
->
numel
()
/
seq_dims
[
0
];
PADDLE_ENFORCE_EQ
(
padding_dims
[
2
],
sequence_width
,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq."
);
const
T
*
padding_data
=
padding
.
data
<
T
>
();
T
*
seq_data
=
seq
->
data
<
T
>
();
for
(
int64_t
i
=
0
;
i
<
num_sequences
;
++
i
)
{
int64_t
start_pos
=
abs_offset_lod
[
level
][
i
];
int64_t
sequence_length
=
abs_offset_lod
[
level
][
i
+
1
]
-
start_pos
;
for
(
int64_t
j
=
0
;
j
<
sequence_length
;
++
j
)
{
// sequence_width > j > 0
T
scale
=
norm_by_times
?
(
1.0
f
/
static_cast
<
T
>
(
sequence_length
))
:
1.0
f
;
for
(
int64_t
k
=
0
;
k
<
sequence_width
;
++
k
)
{
seq_data
[(
start_pos
+
j
)
*
sequence_width
+
k
]
=
padding_data
[(
j
*
num_sequences
+
i
)
*
sequence_width
+
k
]
*
scale
;
}
}
}
}
int
step_width
=
seq_tensor
->
numel
()
/
seq_tensor_dims
[
0
];
CheckDims
(
seq_tensor_dims
,
pad_tensor_dims
,
seq_offsets
,
pad_seq_len
,
step_width
,
layout
);
CopyValidData
<
T
>
(
seq_tensor
,
&
pad_tensor
,
seq_offsets
,
pad_seq_len
,
step_width
,
norm_by_times
,
kPadToSeq
,
layout
);
}
}
};
};
template
class
PaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
int
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
int64_t
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
float
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
float
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
double
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
int
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
int64_t
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
float
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
float
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CPUDeviceContext
,
double
>;
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/math/sequence_padding.cu
浏览文件 @
b98b7440
...
@@ -19,41 +19,32 @@ namespace paddle {
...
@@ -19,41 +19,32 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
template
<
typename
T
,
bool
NormByTimes
,
bool
Padding
>
template
<
typename
T
,
CopyType
Type
>
__global__
void
SequencePaddingKernel
(
T
*
padding
,
T
*
sequence
,
__global__
void
SequencePaddingKernel
(
const
size_t
*
sequence_start_positions
,
T
*
dst
,
const
T
*
src
,
const
T
*
pad_value
,
bool
is_constant_pad
,
const
size_t
sequence_width
,
const
size_t
*
seq_offsets
,
const
size_t
seq_num
,
const
size_t
pad_seq_len
,
const
size_t
max_sequence_length
,
const
size_t
step_width
,
bool
norm_by_len
,
const
PadLayout
layout
)
{
const
size_t
num_sequences
)
{
size_t
seq_idx
=
blockIdx
.
y
;
size_t
padding_idx
=
blockIdx
.
y
;
size_t
seq_len
=
seq_offsets
[
seq_idx
+
1
]
-
seq_offsets
[
seq_idx
];
size_t
start_pos
=
sequence_start_positions
[
padding_idx
];
size_t
sequence_length
=
size_t
step_idx
=
blockIdx
.
x
*
blockDim
.
y
+
threadIdx
.
y
;
sequence_start_positions
[
padding_idx
+
1
]
-
start_pos
;
size_t
seq_data_offset
=
(
seq_offsets
[
seq_idx
]
+
step_idx
)
*
step_width
;
size_t
pad_data_offset
=
layout
==
kBatchLengthWidth
size_t
sequence_idx
=
blockIdx
.
x
*
blockDim
.
y
+
threadIdx
.
y
;
?
(
seq_idx
*
pad_seq_len
+
step_idx
)
*
step_width
size_t
padding_base_idx
=
:
(
step_idx
*
seq_num
+
seq_idx
)
*
step_width
;
(
sequence_idx
*
num_sequences
+
padding_idx
)
*
sequence_width
;
size_t
sequence_base_idx
=
(
start_pos
+
sequence_idx
)
*
sequence_width
;
T
*
dst_data
=
dst
+
(
Type
==
kSeqToPad
?
pad_data_offset
:
seq_data_offset
);
const
T
*
src_data
=
if
(
sequence_idx
<
sequence_length
)
{
src
+
(
Type
==
kSeqToPad
?
seq_data_offset
:
pad_data_offset
);
T
scale
=
NormByTimes
?
(
1.0
f
/
static_cast
<
T
>
(
sequence_length
))
:
1.0
f
;
if
(
Padding
)
{
if
(
step_idx
<
seq_len
)
{
/* sequence -> padding */
float
scale
=
norm_by_len
?
(
1.0
f
/
static_cast
<
float
>
(
seq_len
))
:
1.0
f
;
for
(
size_t
i
=
threadIdx
.
x
;
i
<
sequence_width
;
i
+=
blockDim
.
x
)
{
for
(
size_t
i
=
threadIdx
.
x
;
i
<
step_width
;
i
+=
blockDim
.
x
)
{
padding
[
padding_base_idx
+
i
]
=
scale
*
sequence
[
sequence_base_idx
+
i
];
dst_data
[
i
]
=
scale
*
src_data
[
i
];
}
}
else
{
/* padding -> sequence */
for
(
size_t
i
=
threadIdx
.
x
;
i
<
sequence_width
;
i
+=
blockDim
.
x
)
{
sequence
[
sequence_base_idx
+
i
]
=
scale
*
padding
[
padding_base_idx
+
i
];
}
}
}
else
if
(
sequence_idx
<
max_sequence_length
)
{
if
(
Padding
)
{
/* sequence -> padding */
for
(
size_t
i
=
threadIdx
.
x
;
i
<
sequence_width
;
i
+=
blockDim
.
x
)
{
padding
[
padding_base_idx
+
i
]
=
0
;
}
}
}
else
if
(
step_idx
<
pad_seq_len
&&
Type
==
kSeqToPad
)
{
for
(
size_t
i
=
threadIdx
.
x
;
i
<
step_width
;
i
+=
blockDim
.
x
)
{
dst_data
[
i
]
=
is_constant_pad
?
pad_value
[
0
]
:
pad_value
[
i
];
}
}
}
}
}
}
...
@@ -62,74 +53,59 @@ template <typename T>
...
@@ -62,74 +53,59 @@ template <typename T>
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
public:
public:
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
LoDTensor
&
seq
,
framework
::
Tensor
*
padding
,
const
framework
::
LoDTensor
&
seq_tensor
,
bool
norm_by_times
)
{
framework
::
LoDTensor
*
pad_tensor
,
auto
lod
=
seq
.
lod
();
const
framework
::
LoDTensor
&
pad_value
,
int
pad_seq_len
=
-
1
,
PADDLE_ENFORCE_GT
(
lod
.
size
(),
0UL
,
int
lod_level
=
0
,
bool
norm_by_times
=
false
,
"The lod of LoDTensor seq should not be null."
);
const
PadLayout
layout
=
kBatchLengthWidth
)
{
auto
seq_lod
=
seq_tensor
.
lod
();
const
size_t
level
=
0
;
const
auto
seq_offsets
=
framework
::
ToAbsOffset
(
seq_lod
)[
lod_level
];
framework
::
LoD
abs_offset_lod
=
framework
::
ToAbsOffset
(
lod
);
const
auto
&
seq_tensor_dims
=
seq_tensor
.
dims
();
const
auto
&
pad_tensor_dims
=
pad_tensor
->
dims
();
auto
seq_dims
=
seq
.
dims
();
int
max_seq_len
=
MaximumSequenceLength
(
seq_offsets
);
PADDLE_ENFORCE_EQ
(
seq_dims
[
0
],
if
(
pad_seq_len
==
-
1
)
{
static_cast
<
int64_t
>
(
abs_offset_lod
[
level
].
back
()),
pad_seq_len
=
max_seq_len
;
"The first dimension of LoDTensor seq should be "
}
"equal to the sum of all sequences's length."
);
PADDLE_ENFORCE_GE
(
pad_seq_len
,
max_seq_len
,
"The pad_seq_len must be equal to or greater than the "
auto
padding_dims
=
padding
->
dims
();
"original max sequence length."
);
PADDLE_ENFORCE_EQ
(
padding_dims
.
size
(),
3UL
,
int
step_width
=
seq_tensor
.
numel
()
/
seq_tensor_dims
[
0
];
"The input padding should be a 3-D Tensor of shape "
int
seq_num
=
seq_offsets
.
size
()
-
1
;
"[max_sequence_length, num_sequences, sequence_width]."
);
CheckDims
(
seq_tensor_dims
,
pad_tensor_dims
,
seq_offsets
,
pad_seq_len
,
int64_t
max_sequence_length
=
MaximumSequenceLength
(
lod
,
level
);
step_width
,
layout
);
PADDLE_ENFORCE_EQ
(
padding_dims
[
0
],
max_sequence_length
,
PADDLE_ENFORCE
(
pad_value
.
numel
()
==
1
||
pad_value
.
numel
()
==
step_width
,
"The first dimension of Tensor padding should be the "
"The numel of 'pad_value' can only be 1 or be equal to the "
"maximum length of all sequences in LoDTensor seq."
);
"'step_width'."
);
const
int64_t
num_sequences
=
abs_offset_lod
[
level
].
size
()
-
1
;
if
(
!
norm_by_times
&&
seq_num
==
1UL
&&
pad_seq_len
==
max_seq_len
)
{
PADDLE_ENFORCE_EQ
(
padding_dims
[
1
],
num_sequences
,
TensorCopy
(
seq_tensor
,
context
.
GetPlace
(),
context
,
pad_tensor
);
"The second dimension of Tensor padding should be the "
pad_tensor
->
Resize
(
pad_tensor_dims
);
"number of sequences in LoDTensor seq."
);
const
int64_t
sequence_width
=
seq
.
numel
()
/
seq_dims
[
0
];
PADDLE_ENFORCE_EQ
(
padding_dims
[
2
],
sequence_width
,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq."
);
if
(
!
norm_by_times
&&
num_sequences
==
1UL
)
{
TensorCopy
(
seq
,
context
.
GetPlace
(),
context
,
padding
);
padding
->
Resize
(
padding_dims
);
return
;
return
;
}
}
const
int
64_t
kBlockSize
=
512
;
const
int
kBlockSize
=
512
;
/* At least use 32 threads to copy sequence_width elements,
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
* and at least 8 elements for each thread.
*/
*/
size_t
block_dim_x
=
size_t
block_dim_x
=
std
::
min
(((((
s
equence
_width
+
7
)
>>
3
)
+
31
)
>>
5
)
<<
5
,
kBlockSize
);
std
::
min
(((((
s
tep
_width
+
7
)
>>
3
)
+
31
)
>>
5
)
<<
5
,
kBlockSize
);
size_t
block_dim_y
=
kBlockSize
/
block_dim_x
;
size_t
block_dim_y
=
kBlockSize
/
block_dim_x
;
dim3
threads
(
block_dim_x
,
block_dim_y
);
dim3
threads
(
block_dim_x
,
block_dim_y
);
size_t
grid_dim_x
=
(
max_sequence_length
+
block_dim_y
-
1
)
/
block_dim_y
;
size_t
grid_dim_x
=
(
pad_seq_len
+
block_dim_y
-
1
)
/
block_dim_y
;
size_t
grid_dim_y
=
num_sequences
;
size_t
grid_dim_y
=
seq_num
;
dim3
grid
(
grid_dim_x
,
grid_dim_y
);
dim3
grid
(
grid_dim_x
,
grid_dim_y
);
const
T
*
seq_data
=
seq
.
data
<
T
>
();
const
T
*
seq_data
=
seq_tensor
.
data
<
T
>
();
T
*
padding_data
=
padding
->
data
<
T
>
();
T
*
pad_data
=
pad_tensor
->
data
<
T
>
();
if
(
norm_by_times
)
{
const
T
*
pad_value_data
=
pad_value
.
data
<
T
>
();
SequencePaddingKernel
<
T
,
1
,
1
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
padding_data
,
const_cast
<
T
*>
(
seq_data
),
SequencePaddingKernel
<
T
,
kSeqToPad
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
abs_offset_lod
[
level
].
CUDAData
(
context
.
GetPlace
()),
sequence_width
,
pad_data
,
seq_data
,
pad_value_data
,
pad_value
.
numel
()
==
1
,
max_sequence_length
,
num_sequences
);
seq_offsets
.
CUDAData
(
context
.
GetPlace
()),
seq_num
,
pad_seq_len
,
}
else
{
step_width
,
norm_by_times
,
layout
);
SequencePaddingKernel
<
T
,
0
,
1
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
padding_data
,
const_cast
<
T
*>
(
seq_data
),
abs_offset_lod
[
level
].
CUDAData
(
context
.
GetPlace
()),
sequence_width
,
max_sequence_length
,
num_sequences
);
}
}
}
};
};
...
@@ -137,79 +113,62 @@ template <typename T>
...
@@ -137,79 +113,62 @@ template <typename T>
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
public:
public:
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
framework
::
LoDTensor
*
seq
,
const
framework
::
Tensor
&
padding
,
const
framework
::
LoDTensor
&
pad_tensor
,
bool
norm_by_times
)
{
framework
::
LoDTensor
*
seq_tensor
,
int
pad_seq_len
=
-
1
,
auto
lod
=
seq
->
lod
();
int
lod_level
=
0
,
bool
norm_by_times
=
false
,
PADDLE_ENFORCE_GT
(
lod
.
size
(),
0UL
,
const
PadLayout
layout
=
kBatchLengthWidth
)
{
"The lod of LoDTensor seq should not be null."
);
auto
seq_offsets
=
framework
::
ToAbsOffset
(
seq_tensor
->
lod
())[
lod_level
];
const
auto
&
seq_tensor_dims
=
seq_tensor
->
dims
();
const
size_t
level
=
0
;
const
auto
&
pad_tensor_dims
=
pad_tensor
.
dims
();
framework
::
LoD
abs_offset_lod
=
framework
::
ToAbsOffset
(
lod
);
int
max_seq_len
=
MaximumSequenceLength
(
seq_offsets
);
if
(
pad_seq_len
==
-
1
)
{
auto
seq_dims
=
seq
->
dims
();
pad_seq_len
=
max_seq_len
;
PADDLE_ENFORCE_EQ
(
seq_dims
[
0
],
}
static_cast
<
int64_t
>
(
abs_offset_lod
[
level
].
back
()),
int
step_width
=
seq_tensor
->
numel
()
/
seq_tensor_dims
[
0
];
"The first dimension of LoDTensor seq should be "
int
seq_num
=
seq_offsets
.
size
()
-
1
;
"equal to the sum of all sequences's length."
);
CheckDims
(
seq_tensor_dims
,
pad_tensor_dims
,
seq_offsets
,
pad_seq_len
,
auto
padding_dims
=
padding
.
dims
();
step_width
,
layout
);
PADDLE_ENFORCE_EQ
(
padding_dims
.
size
(),
3UL
,
"The input padding should be a 3-D Tensor of shape "
if
(
!
norm_by_times
&&
seq_num
==
1UL
&&
pad_seq_len
==
max_seq_len
)
{
"[max_sequnece_length, num_sequences, sequence_width]."
);
TensorCopy
(
pad_tensor
,
context
.
GetPlace
(),
context
,
seq_tensor
);
seq_tensor
->
Resize
(
seq_tensor_dims
);
int64_t
max_sequence_length
=
MaximumSequenceLength
(
lod
,
level
);
PADDLE_ENFORCE_EQ
(
padding_dims
[
0
],
max_sequence_length
,
"The first dimension of Tensor padding should be "
"the maximum length of all sequences in LoDTensor seq."
);
const
int64_t
num_sequences
=
abs_offset_lod
[
level
].
size
()
-
1
;
PADDLE_ENFORCE_EQ
(
padding_dims
[
1
],
num_sequences
,
"The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq."
);
const
int64_t
sequence_width
=
seq
->
numel
()
/
seq_dims
[
0
];
PADDLE_ENFORCE_EQ
(
padding_dims
[
2
],
sequence_width
,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq."
);
if
(
!
norm_by_times
&&
num_sequences
==
1UL
)
{
TensorCopy
(
padding
,
context
.
GetPlace
(),
context
,
seq
);
seq
->
Resize
(
seq_dims
);
return
;
return
;
}
}
const
int
64_t
kBlockSize
=
512
;
const
int
kBlockSize
=
512
;
/* At least use 32 threads to copy sequence_width elements,
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
* and at least 8 elements for each thread.
*/
*/
size_t
block_dim_x
=
size_t
block_dim_x
=
std
::
min
(((((
s
equence
_width
+
7
)
>>
3
)
+
31
)
>>
5
)
<<
5
,
kBlockSize
);
std
::
min
(((((
s
tep
_width
+
7
)
>>
3
)
+
31
)
>>
5
)
<<
5
,
kBlockSize
);
size_t
block_dim_y
=
kBlockSize
/
block_dim_x
;
size_t
block_dim_y
=
kBlockSize
/
block_dim_x
;
dim3
threads
(
block_dim_x
,
block_dim_y
);
dim3
threads
(
block_dim_x
,
block_dim_y
);
size_t
grid_dim_x
=
(
max_sequence_length
+
block_dim_y
-
1
)
/
block_dim_y
;
size_t
grid_dim_x
=
(
pad_seq_len
+
block_dim_y
-
1
)
/
block_dim_y
;
size_t
grid_dim_y
=
num_sequences
;
size_t
grid_dim_y
=
seq_num
;
dim3
grid
(
grid_dim_x
,
grid_dim_y
);
dim3
grid
(
grid_dim_x
,
grid_dim_y
);
const
T
*
padding_data
=
padding
.
data
<
T
>
();
const
T
*
pad_data
=
pad_tensor
.
data
<
T
>
();
T
*
seq_data
=
seq
->
data
<
T
>
();
T
*
seq_data
=
seq_tensor
->
data
<
T
>
();
if
(
norm_by_times
)
{
SequencePaddingKernel
<
T
,
1
,
0
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
SequencePaddingKernel
<
T
,
kPadToSeq
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
const_cast
<
T
*>
(
padding_data
),
seq_data
,
seq_data
,
pad_data
,
nullptr
,
false
,
abs_offset_lod
[
level
].
CUDAData
(
context
.
GetPlace
()),
sequence_width
,
seq_offsets
.
CUDAData
(
context
.
GetPlace
()),
seq_num
,
pad_seq_len
,
max_sequence_length
,
num_sequences
);
step_width
,
norm_by_times
,
layout
);
}
else
{
SequencePaddingKernel
<
T
,
0
,
0
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
const_cast
<
T
*>
(
padding_data
),
seq_data
,
abs_offset_lod
[
level
].
CUDAData
(
context
.
GetPlace
()),
sequence_width
,
max_sequence_length
,
num_sequences
);
}
}
}
};
};
template
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
int
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
int
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
double
>;
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/math/sequence_padding.h
浏览文件 @
b98b7440
...
@@ -15,6 +15,7 @@ limitations under the License. */
...
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#pragma once
#include <algorithm>
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
...
@@ -22,17 +23,33 @@ namespace paddle {
...
@@ -22,17 +23,33 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
inline
static
size_t
MaximumSequenceLength
(
const
framework
::
LoD
&
lod
,
enum
PadLayout
{
kBatchLengthWidth
=
0
,
kLengthBatchWidth
};
const
size_t
level
)
{
const
size_t
num_sequences
=
lod
[
level
].
size
()
-
1
;
enum
CopyType
{
kSeqToPad
,
kPadToSeq
};
size_t
max_sequence_length
=
0
;
framework
::
LoD
abs_offset_lod
=
framework
::
ToAbsOffset
(
lod
);
inline
static
size_t
MaximumSequenceLength
(
for
(
size_t
i
=
0
;
i
<
num_sequences
;
++
i
)
{
const
framework
::
Vector
<
size_t
>&
seq_offset
)
{
max_sequence_length
=
size_t
seq_num
=
seq_offset
.
size
()
-
1
;
std
::
max
(
max_sequence_length
,
size_t
max_seq_len
=
0
;
abs_offset_lod
[
level
][
i
+
1
]
-
abs_offset_lod
[
level
][
i
]);
for
(
size_t
i
=
0
;
i
<
seq_num
;
++
i
)
{
max_seq_len
=
std
::
max
(
max_seq_len
,
seq_offset
[
i
+
1
]
-
seq_offset
[
i
]);
}
}
return
max_sequence_length
;
return
max_seq_len
;
}
inline
static
void
CheckDims
(
const
framework
::
DDim
&
seq_tensor_dims
,
const
framework
::
DDim
&
pad_tensor_dims
,
const
framework
::
Vector
<
size_t
>&
seq_offset
,
int64_t
padded_seq_len
,
int64_t
step_width
,
const
PadLayout
&
layout
)
{
PADDLE_ENFORCE_EQ
(
static_cast
<
size_t
>
(
seq_tensor_dims
[
0
]),
seq_offset
.
back
(),
"Value of 1st dimension of the sequence tensor should be "
"equal to sum of lengths of all sequences."
);
PADDLE_ENFORCE
(
seq_tensor_dims
.
size
()
+
1
==
pad_tensor_dims
.
size
()
||
seq_tensor_dims
.
size
()
==
pad_tensor_dims
.
size
(),
"pad_tensor's rank should be 1 greater than seq_tensor's "
"rank, or be equal with it."
);
}
}
/*
/*
...
@@ -64,15 +81,22 @@ inline static size_t MaximumSequenceLength(const framework::LoD& lod,
...
@@ -64,15 +81,22 @@ inline static size_t MaximumSequenceLength(const framework::LoD& lod,
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
PaddingLoDTensorFunctor
{
class
PaddingLoDTensorFunctor
{
public:
public:
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
LoDTensor
&
seq
,
void
operator
()(
const
DeviceContext
&
context
,
framework
::
Tensor
*
padding
,
bool
norm_by_times
);
const
framework
::
LoDTensor
&
seq_tensor
,
framework
::
LoDTensor
*
pad_tensor
,
const
framework
::
LoDTensor
&
pad_value
,
int
pad_seq_len
=
-
1
,
int
lod_level
=
0
,
bool
norm_by_times
=
false
,
const
PadLayout
layout
=
kBatchLengthWidth
);
};
};
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
UnpaddingLoDTensorFunctor
{
class
UnpaddingLoDTensorFunctor
{
public:
public:
void
operator
()(
const
DeviceContext
&
context
,
framework
::
LoDTensor
*
seq
,
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
padding
,
bool
norm_by_times
);
const
framework
::
LoDTensor
&
pad_tensor
,
framework
::
LoDTensor
*
seq_tensor
,
int
pad_seq_len
=
-
1
,
int
lod_level
=
0
,
bool
norm_by_times
=
false
,
const
PadLayout
layout
=
kBatchLengthWidth
);
};
};
}
// namespace math
}
// namespace math
...
...
paddle/fluid/operators/math/sequence_padding_test.cc
浏览文件 @
b98b7440
...
@@ -23,7 +23,9 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
...
@@ -23,7 +23,9 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
paddle
::
framework
::
LoDTensor
cpu_seq_back
;
paddle
::
framework
::
LoDTensor
cpu_seq_back
;
paddle
::
framework
::
LoDTensor
seq
;
paddle
::
framework
::
LoDTensor
seq
;
paddle
::
framework
::
LoDTensor
seq_back
;
paddle
::
framework
::
LoDTensor
seq_back
;
paddle
::
framework
::
Tensor
padding
;
paddle
::
framework
::
LoDTensor
padding
;
paddle
::
framework
::
LoDTensor
cpu_pad_value
;
paddle
::
framework
::
LoDTensor
pad_value
;
const
size_t
level
=
lod
.
size
()
-
1
;
const
size_t
level
=
lod
.
size
()
-
1
;
auto
seq_dims
=
auto
seq_dims
=
...
@@ -46,20 +48,33 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
...
@@ -46,20 +48,33 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
}
}
const
size_t
max_sequence_length
=
const
size_t
max_sequence_length
=
paddle
::
operators
::
math
::
MaximumSequenceLength
(
lod
,
level
);
paddle
::
operators
::
math
::
MaximumSequenceLength
(
lod
[
level
]
);
const
size_t
num_sequences
=
lod
[
level
].
size
()
-
1
;
const
size_t
num_sequences
=
lod
[
level
].
size
()
-
1
;
auto
padding_dims
=
auto
padding_dims
=
paddle
::
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
max_sequence_length
),
paddle
::
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
max_sequence_length
),
static_cast
<
int64_t
>
(
num_sequences
),
static_cast
<
int64_t
>
(
num_sequences
),
static_cast
<
int64_t
>
(
sequence_width
)});
static_cast
<
int64_t
>
(
sequence_width
)});
padding
.
mutable_data
<
T
>
(
padding_dims
,
*
place
);
padding
.
mutable_data
<
T
>
(
padding_dims
,
*
place
);
T
*
pad_value_data
=
cpu_pad_value
.
mutable_data
<
T
>
({
1
},
paddle
::
platform
::
CPUPlace
());
*
pad_value_data
=
static_cast
<
T
>
(
0
);
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
pad_value
=
cpu_pad_value
;
}
else
{
TensorCopySync
(
cpu_pad_value
,
*
place
,
&
pad_value
);
}
paddle
::
operators
::
math
::
PaddingLoDTensorFunctor
<
DeviceContext
,
T
>
()(
paddle
::
operators
::
math
::
PaddingLoDTensorFunctor
<
DeviceContext
,
T
>
()(
*
context
,
seq
,
&
padding
,
false
);
*
context
,
seq
,
&
padding
,
pad_value
,
-
1
,
0
,
false
,
paddle
::
operators
::
math
::
kLengthBatchWidth
);
seq_back
.
set_lod
(
lod
);
seq_back
.
set_lod
(
lod
);
seq_back
.
mutable_data
<
T
>
(
seq_dims
,
*
place
);
seq_back
.
mutable_data
<
T
>
(
seq_dims
,
*
place
);
paddle
::
operators
::
math
::
UnpaddingLoDTensorFunctor
<
DeviceContext
,
T
>
()(
paddle
::
operators
::
math
::
UnpaddingLoDTensorFunctor
<
DeviceContext
,
T
>
()(
*
context
,
&
seq_back
,
padding
,
false
);
*
context
,
padding
,
&
seq_back
,
-
1
,
0
,
false
,
paddle
::
operators
::
math
::
kLengthBatchWidth
);
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
cpu_seq_back
=
seq_back
;
cpu_seq_back
=
seq_back
;
...
...
paddle/fluid/operators/pad_constant_like_op.cc
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pad_constant_like_op.h"
namespace
paddle
{
namespace
operators
{
using
framework
::
Tensor
;
// Forward operator definition for pad_constant_like.
//
// Validates the inputs at shape-inference time and declares that Out has the
// same dims (and LoD) as X. The kernel data type is taken from Y.
class PadConstantLikeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that X, Y and Out are present, that X and Y have the same rank and
  // that X is at least as large as Y along every axis; then sets the dims of
  // Out to the dims of X and shares X's LoD with Out.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of PadConstantLikeOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Y"),
                   "Input(Y) of PadConstantLikeOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of PadConstantLikeOp should not be null.");

    auto x_dim = ctx->GetInputDim("X");
    auto y_dim = ctx->GetInputDim("Y");

    PADDLE_ENFORCE_EQ(x_dim.size(), y_dim.size(),
                      "The dimension of X and Y should be the same.");

    // Y can only be padded up to X, never cropped: every axis of X must be
    // at least as long as the matching axis of Y.
    for (int i = 0; i < x_dim.size(); ++i) {
      PADDLE_ENFORCE_GE(x_dim[i], y_dim[i],
                        "Each dimension of X should be no less than the "
                        "corresponding dimension of Y (failed at axis %d).",
                        i);
    }

    ctx->SetOutputDim("Out", x_dim);
    ctx->ShareLoD("X", /*->*/ "Out");
  }

 protected:
  // The kernel is selected by the element type of Y and the current device.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Y")->type()),
        ctx.device_context());
  }
};
// Declares the inputs, output, attribute and user-facing documentation of the
// pad_constant_like operator.
class PadConstantLikeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "The input of pad_constant_like op. "
             "The input should be a k-D tensor(k > 0 and k < 7)");
    AddInput("Y",
             "The input of pad_constant_like op. "
             "The input should be a k-D tensor(k > 0 and k < 7)");
    AddOutput("Out",
              "The output of pad_constant_like op. "
              "A tensor with the same shape as X.");
    // NOTE: the attribute is always stored as float, whatever the element
    // type of the tensors is.
    AddAttr<float>("pad_value",
                   "(float, default 0.0) "
                   "The value to fill the padded areas.")
        .SetDefault(0.0f);
    AddComment(R"DOC(
PadConstantLikeOp Operator.
Pad input(Y) with a pad_value, the number of values padded to the edges of each
axis is specified by the difference of the shape of X and Y.
((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) unique pad widths for
each axis.
The input should be a k-D tensor(k > 0 and k < 7). As an example:
case1:
Given:
X = [[1, 2],
[3, 4],
[1, 2],
[3, 4]],
X.shape = (4, 2)
Y = [[5, 6],
[7, 8]],
Y.shape = (2, 2)
And
pad_value = 0,
Return:
Out = [[5, 6],
[7, 8],
[0, 0],
[0, 0]]
Out.shape = (4, 2)
case2:
Given:
X = [[[[ 0, 1, 2],
[ 3, 4, 5]],
[[ 6, 7, 8],
[ 9, 10, 11]],
[[12, 13, 14],
[15, 16, 17]]],
[[[18, 19, 20],
[21, 22, 23]],
[[24, 25, 26],
[27, 28, 29]],
[[30, 31, 32],
[33, 34, 35]]]]
X.shape = (2, 3, 2, 3)
Y = [[[[35, 36, 37]],
[[38, 39, 40]],
[[41, 42, 43]]]]
Y.shape = (1, 3, 1, 3)
And
pad_value = -1,
Return:
Out = [[[[35, 36, 37],
[-1, -1, -1]],
[[38, 39, 40],
[-1, -1, -1]],
[[41, 42, 43],
[-1, -1, -1]]],
[[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]]]]
Out.shape = (2, 3, 2, 3)
)DOC");
  }
};
// Backward operator definition for pad_constant_like.
//
// Takes Y and Out@GRAD as inputs and, when requested, produces Y@GRAD with
// the dims and LoD of Y.
class PadConstantLikeOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that Y and Out@GRAD are present and have the same rank. If the
  // Y@GRAD output exists, it is given the dims and LoD of Y and every axis of
  // Out@GRAD is verified to be at least as long as the matching axis of Y.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");

    auto y_dim = ctx->GetInputDim("Y");
    auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out"));

    // This check compares Out@GRAD with Y (X is not an input of the grad op),
    // so the message names those two tensors.
    PADDLE_ENFORCE_EQ(dout_dim.size(), y_dim.size(),
                      "The dimension of Out@GRAD and Y should be the same.");

    auto y_grad_name = framework::GradVarName("Y");
    if (ctx->HasOutput(y_grad_name)) {
      ctx->SetOutputDim(y_grad_name, y_dim);
      ctx->ShareLoD("Y", /*->*/ y_grad_name);

      for (int i = 0; i < y_dim.size(); ++i) {
        PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i],
                          "Each dimension of Out@GRAD should be no less than "
                          "the corresponding dimension of Y (failed at axis "
                          "%d).",
                          i);
      }
    }
  }

 protected:
  // The kernel is selected by the element type of Y and the current device.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Y")->type()),
        ctx.device_context());
  }
};
// Builds the description of the pad_constant_like_grad op from the forward
// op: inputs Y and Out@GRAD, output Y@GRAD, attributes copied unchanged.
class PadConstantLikeOpGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  // Returns the grad op description. The OpDesc is owned by a unique_ptr
  // from the moment it is allocated, so it is released if any of the setters
  // below throws.
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> bind(new framework::OpDesc());
    bind->SetType("pad_constant_like_grad");
    bind->SetInput("Y", Input("Y"));
    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    bind->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
    bind->SetAttrMap(Attrs());
    return bind;
  }
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
pad_constant_like
,
ops
::
PadConstantLikeOp
,
ops
::
PadConstantLikeOpMaker
,
ops
::
PadConstantLikeOpGradMaker
);
REGISTER_OPERATOR
(
pad_constant_like_grad
,
ops
::
PadConstantLikeOpGrad
);
REGISTER_OP_CPU_KERNEL
(
pad_constant_like
,
ops
::
PadConstantLikeKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
PadConstantLikeKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
REGISTER_OP_CPU_KERNEL
(
pad_constant_like_grad
,
ops
::
PadConstantLikeGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
PadConstantLikeGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/pad_constant_like_op.cu
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/pad_constant_like_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
pad_constant_like
,
ops
::
PadConstantLikeKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
PadConstantLikeKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
pad_constant_like_grad
,
ops
::
PadConstantLikeGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
PadConstantLikeGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
paddle/fluid/operators/pad_constant_like_op.h
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/padding.h"
namespace
paddle
{
namespace
operators
{
// Forward kernel of pad_constant_like: writes Y into Out and fills the rest
// of Out, which has the shape of X, with the "pad_value" attribute.
template <typename DeviceContext, typename T>
class PadConstantLikeKernel : public framework::OpKernel<T> {
 public:
  // Reads inputs X and Y and output Out from `context`.
  //  - If X and Y have identical dims there is nothing to pad and Out is made
  //    to share Y's data.
  //  - Otherwise, for every axis j the pad widths are
  //    (0, X.dims[j] - Y.dims[j]) and the work is delegated to
  //    math::PaddingFunctor.
  void Compute(const framework::ExecutionContext &context) const override {
    auto in_x = context.Input<framework::Tensor>("X");
    auto in_y = context.Input<framework::Tensor>("Y");
    auto *out = context.Output<framework::Tensor>("Out");

    if (in_x->dims() == in_y->dims()) {
      // NOTE(review): Out aliases Y's buffer here instead of owning a copy;
      // confirm that no later in-place consumer of Out relies on Y staying
      // untouched.
      out->ShareDataWith(*in_y);
      return;
    }

    // "pad_value" is declared as a float attribute by the op maker, so it
    // must be read as float and converted; reading it as T requests a type
    // the attribute does not hold when T is double.
    T pad_value = static_cast<T>(context.Attr<float>("pad_value"));
    out->mutable_data<T>(context.GetPlace());

    int rank = in_x->dims().size();

    // pads holds (before, after) pairs per axis; "before" stays 0, so only
    // the "after" entries need to be filled in.
    std::vector<int> pads(static_cast<size_t>(rank) * 2, 0);
    for (int j = 0; j < rank; ++j) {
      pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]);
    }

    math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value,
                                           *in_y, out);
  }
};
// Backward kernel of pad_constant_like: produces Y@GRAD from Out@GRAD.
template <typename DeviceContext, typename T>
class PadConstantLikeGradKernel : public framework::OpKernel<T> {
 public:
  // Reads Y and Out@GRAD from `context` and fills Y@GRAD when that output is
  // present. The per-axis pad widths handed to math::PaddingGradFunctor are
  // (0, Out@GRAD.dims[j] - Y.dims[j]).
  void Compute(const framework::ExecutionContext &context) const override {
    const std::string dout_name = framework::GradVarName("Out");
    const std::string dy_name = framework::GradVarName("Y");

    auto y = context.Input<framework::Tensor>("Y");
    auto dout = context.Input<framework::Tensor>(dout_name);
    auto *dy = context.Output<framework::Tensor>(dy_name);

    // Nothing to compute when the gradient of Y is not requested.
    if (dy == nullptr) {
      return;
    }

    // Same shape means no padding was applied in the forward pass, so the
    // gradient is passed through by sharing Out@GRAD's data.
    if (dout->dims() == y->dims()) {
      dy->ShareDataWith(*dout);
      return;
    }

    dy->mutable_data<T>(context.GetPlace());

    const int rank = dout->dims().size();

    // (before, after) pairs per axis; "before" is always 0.
    std::vector<int> widths;
    widths.reserve(static_cast<size_t>(rank) * 2);
    for (int axis = 0; axis < rank; ++axis) {
      widths.push_back(0);
      widths.push_back(static_cast<int>(dout->dims()[axis] - y->dims()[axis]));
    }

    // Presumably crops Out@GRAD back to the region occupied by Y - confirm
    // against math/padding.h.
    math::PaddingGradFunctor<DeviceContext, T>(rank, context, widths, *dout,
                                               dy);
  }
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/pad_op.h
浏览文件 @
b98b7440
...
@@ -18,117 +18,44 @@ limitations under the License. */
...
@@ -18,117 +18,44 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/padding.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
,
size_t
D
,
int
MajorType
=
Eigen
::
RowMajor
,
template
<
typename
DeviceContext
,
typename
T
>
typename
IndexType
=
Eigen
::
DenseIndex
>
class
PadKernel
:
public
framework
::
OpKernel
<
T
>
{
using
EigenTensor
=
framework
::
EigenTensor
<
T
,
D
,
MajorType
,
IndexType
>
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
void
PadFunction
(
const
framework
::
ExecutionContext
&
context
)
{
auto
pads
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
auto
pads
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
Eigen
::
array
<
std
::
pair
<
int
,
int
>
,
D
>
paddings
;
for
(
size_t
i
=
0
;
i
<
paddings
.
size
();
++
i
)
{
paddings
[
i
].
first
=
pads
[
i
*
2
];
paddings
[
i
].
second
=
pads
[
i
*
2
+
1
];
}
T
pad_value
=
context
.
Attr
<
T
>
(
"pad_value"
);
T
pad_value
=
context
.
Attr
<
T
>
(
"pad_value"
);
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
auto
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
x_tensor
=
EigenTensor
<
T
,
D
>::
From
(
*
x
);
int
rank
=
x
->
dims
().
size
();
auto
out_tensor
=
EigenTensor
<
T
,
D
>::
From
(
*
out
);
math
::
PaddingFunctor
<
DeviceContext
,
T
>
(
rank
,
context
,
pads
,
pad_value
,
*
x
,
auto
&
place
=
out
);
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
}
out_tensor
.
device
(
place
)
=
x_tensor
.
pad
(
paddings
,
pad_value
);
};
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
PadKernel
:
public
framework
::
OpKernel
<
T
>
{
class
Pad
Grad
Kernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
int
rank
=
context
.
Input
<
Tensor
>
(
"X"
)
->
dims
().
size
();
switch
(
rank
)
{
case
1
:
PadFunction
<
DeviceContext
,
T
,
1
>
(
context
);
break
;
case
2
:
PadFunction
<
DeviceContext
,
T
,
2
>
(
context
);
break
;
case
3
:
PadFunction
<
DeviceContext
,
T
,
3
>
(
context
);
break
;
case
4
:
PadFunction
<
DeviceContext
,
T
,
4
>
(
context
);
break
;
case
5
:
PadFunction
<
DeviceContext
,
T
,
5
>
(
context
);
break
;
case
6
:
PadFunction
<
DeviceContext
,
T
,
6
>
(
context
);
break
;
default:
PADDLE_THROW
(
"PadOp only support tensors with no more than 6 dimensions."
);
}
}
};
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
void
PadGradFunction
(
const
framework
::
ExecutionContext
&
context
)
{
auto
pads
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
auto
pads
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
Eigen
::
array
<
std
::
pair
<
int
,
int
>
,
D
>
paddings
;
for
(
size_t
i
=
0
;
i
<
paddings
.
size
();
++
i
)
{
paddings
[
i
].
first
=
-
pads
[
i
*
2
];
paddings
[
i
].
second
=
-
pads
[
i
*
2
+
1
];
}
auto
*
d_out
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_out
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_x
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
d_x
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
if
(
d_x
!=
nullptr
)
{
if
(
d_x
==
nullptr
)
{
d_x
->
mutable_data
<
T
>
(
context
.
GetPlace
());
return
;
auto
d_x_tensor
=
EigenTensor
<
T
,
D
>::
From
(
*
d_x
);
auto
d_out_tensor
=
EigenTensor
<
T
,
D
>::
From
(
*
d_out
);
auto
&
place
=
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
d_x_tensor
.
device
(
place
)
=
d_out_tensor
.
pad
(
paddings
,
0
);
}
}
}
template
<
typename
DeviceContext
,
typename
T
>
d_x
->
mutable_data
<
T
>
(
context
.
GetPlace
());
class
PadGradKernel
:
public
framework
::
OpKernel
<
T
>
{
int
rank
=
d_out
->
dims
().
size
();
public:
math
::
PaddingGradFunctor
<
DeviceContext
,
T
>
(
rank
,
context
,
pads
,
*
d_out
,
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
d_x
);
size_t
rank
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
))
->
dims
().
size
();
switch
(
rank
)
{
case
1
:
PadGradFunction
<
DeviceContext
,
T
,
1
>
(
context
);
break
;
case
2
:
PadGradFunction
<
DeviceContext
,
T
,
2
>
(
context
);
break
;
case
3
:
PadGradFunction
<
DeviceContext
,
T
,
3
>
(
context
);
break
;
case
4
:
PadGradFunction
<
DeviceContext
,
T
,
4
>
(
context
);
break
;
case
5
:
PadGradFunction
<
DeviceContext
,
T
,
5
>
(
context
);
break
;
case
6
:
PadGradFunction
<
DeviceContext
,
T
,
6
>
(
context
);
break
;
default:
PADDLE_THROW
(
"PadOp only support tensors with no more than 6 dimensions."
);
}
}
}
};
};
...
...
paddle/fluid/operators/parallel_do_op.cc
浏览文件 @
b98b7440
...
@@ -355,6 +355,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
...
@@ -355,6 +355,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
grad
->
SetInput
(
framework
::
GradVarName
(
output_param
),
og_names
);
grad
->
SetInput
(
framework
::
GradVarName
(
output_param
),
og_names
);
}
}
}
}
grad
->
SetInput
(
"Communicator"
,
{
"nccl_com__do_not_change_"
});
grad
->
SetAttrMap
(
this
->
Attrs
());
grad
->
SetAttrMap
(
this
->
Attrs
());
grad
->
SetBlockAttr
(
kParallelBlock
,
grad_block_
[
0
]);
grad
->
SetBlockAttr
(
kParallelBlock
,
grad_block_
[
0
]);
...
...
paddle/fluid/operators/print_op.cc
浏览文件 @
b98b7440
...
@@ -13,14 +13,12 @@
...
@@ -13,14 +13,12 @@
limitations under the License. */
limitations under the License. */
#include <algorithm>
#include <algorithm>
#include <ctime>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/variable.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
using
framework
::
GradVarName
;
#define CLOG std::cout
#define CLOG std::cout
...
@@ -35,7 +33,7 @@ struct Formater {
...
@@ -35,7 +33,7 @@ struct Formater {
std
::
type_index
dtype
{
typeid
(
const
char
)};
std
::
type_index
dtype
{
typeid
(
const
char
)};
framework
::
LoD
lod
;
framework
::
LoD
lod
;
int
summarize
;
int
summarize
;
void
*
data
{
nullptr
};
void
*
data
{
nullptr
};
void
operator
()(
size_t
size
)
{
void
operator
()(
size_t
size
)
{
PrintMessage
();
PrintMessage
();
...
@@ -101,7 +99,7 @@ struct Formater {
...
@@ -101,7 +99,7 @@ struct Formater {
template
<
typename
T
>
template
<
typename
T
>
void
Display
(
size_t
size
)
{
void
Display
(
size_t
size
)
{
auto
*
d
=
reinterpret_cast
<
T
*>
(
data
);
auto
*
d
=
reinterpret_cast
<
T
*>
(
data
);
CLOG
<<
"
\t
data: "
;
CLOG
<<
"
\t
data: "
;
if
(
summarize
!=
-
1
)
{
if
(
summarize
!=
-
1
)
{
summarize
=
std
::
min
(
size
,
(
size_t
)
summarize
);
summarize
=
std
::
min
(
size
,
(
size_t
)
summarize
);
...
@@ -120,51 +118,36 @@ struct Formater {
...
@@ -120,51 +118,36 @@ struct Formater {
// TODO(ChunweiYan) there should be some other printers for TensorArray
// TODO(ChunweiYan) there should be some other printers for TensorArray
class
TensorPrintOp
:
public
framework
::
OperatorBase
{
class
TensorPrintOp
:
public
framework
::
OperatorBase
{
public:
public:
TensorPrintOp
(
const
std
::
string
&
type
,
TensorPrintOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
)
const
framework
::
AttributeMap
&
attrs
)
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{}
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{}
TensorPrintOp
(
const
TensorPrintOp
&
o
)
TensorPrintOp
(
const
TensorPrintOp
&
o
)
:
framework
::
OperatorBase
(
:
framework
::
OperatorBase
(
static_cast
<
const
framework
::
OperatorBase
&>
(
o
))
{
static_cast
<
const
framework
::
OperatorBase
&>
(
o
))
{
PADDLE_THROW
(
"Not implemented."
);
PADDLE_THROW
(
"Not implemented."
);
}
}
private:
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
{
const
platform
::
Place
&
place
)
const
override
{
const
framework
::
Variable
*
in_var_ptr
=
nullptr
;
const
framework
::
Variable
*
in_var_ptr
=
nullptr
;
std
::
string
phase
(
kForward
);
std
::
string
printed_var_name
=
""
;
std
::
string
printed_var_name
=
""
;
auto
&
inputs
=
Inputs
();
if
(
inputs
.
find
(
"In"
)
!=
inputs
.
end
()
&&
!
Inputs
(
"In"
).
empty
())
{
in_var_ptr
=
scope
.
FindVar
(
Input
(
"In"
));
in_var_ptr
=
scope
.
FindVar
(
Input
(
"In"
));
printed_var_name
=
Inputs
(
"In"
).
front
();
printed_var_name
=
Inputs
(
"In"
).
front
();
}
else
if
(
inputs
.
find
(
"In@GRAD"
)
!=
inputs
.
end
()
&&
!
Inputs
(
"In@GRAD"
).
empty
())
{
in_var_ptr
=
scope
.
FindVar
(
Input
(
"In@GRAD"
));
printed_var_name
=
Inputs
(
"In@GRAD"
).
front
();
phase
=
std
::
string
(
kBackward
);
}
else
{
PADDLE_THROW
(
"Unknown phase, should be forward or backward."
);
}
PADDLE_ENFORCE_NOT_NULL
(
in_var_ptr
);
PADDLE_ENFORCE_NOT_NULL
(
in_var_ptr
);
auto
&
in_tensor
=
in_var_ptr
->
Get
<
framework
::
LoDTensor
>
();
auto
&
in_tensor
=
in_var_ptr
->
Get
<
framework
::
LoDTensor
>
();
auto
*
out_var_ptr
=
scope
.
FindVar
(
Output
(
"Out"
));
auto
&
out_tensor
=
*
out_var_ptr
->
GetMutable
<
framework
::
LoDTensor
>
();
// Just copy data from input tensor to output tensor
// output tensor share same memory with input tensor
out_tensor
.
ShareDataWith
(
in_tensor
);
out_tensor
.
set_lod
(
in_tensor
.
lod
());
std
::
string
print_phase
=
Attr
<
std
::
string
>
(
"print_phase"
);
std
::
string
print_phase
=
Attr
<
std
::
string
>
(
"print_phase"
);
if
(
print_phase
!=
phase
&&
print_phase
!=
std
::
string
(
kBoth
))
{
bool
is_forward
=
Attr
<
bool
>
(
"is_forward"
);
if
((
is_forward
&&
print_phase
==
kBackward
)
||
(
!
is_forward
&&
print_phase
==
kForward
))
{
return
;
return
;
}
}
...
@@ -192,7 +175,7 @@ class TensorPrintOp : public framework::OperatorBase {
...
@@ -192,7 +175,7 @@ class TensorPrintOp : public framework::OperatorBase {
formater
.
dtype
=
printed_tensor
.
type
();
formater
.
dtype
=
printed_tensor
.
type
();
}
}
if
(
Attr
<
bool
>
(
"print_tensor_shape"
))
{
if
(
Attr
<
bool
>
(
"print_tensor_shape"
))
{
auto
&
dims
=
printed_tensor
.
dims
();
auto
&
dims
=
printed_tensor
.
dims
();
formater
.
dims
.
resize
(
dims
.
size
());
formater
.
dims
.
resize
(
dims
.
size
());
for
(
int
i
=
0
;
i
<
dims
.
size
();
++
i
)
formater
.
dims
[
i
]
=
dims
[
i
];
for
(
int
i
=
0
;
i
<
dims
.
size
();
++
i
)
formater
.
dims
[
i
]
=
dims
[
i
];
}
}
...
@@ -200,7 +183,7 @@ class TensorPrintOp : public framework::OperatorBase {
...
@@ -200,7 +183,7 @@ class TensorPrintOp : public framework::OperatorBase {
formater
.
lod
=
printed_tensor
.
lod
();
formater
.
lod
=
printed_tensor
.
lod
();
}
}
formater
.
summarize
=
Attr
<
int
>
(
"summarize"
);
formater
.
summarize
=
Attr
<
int
>
(
"summarize"
);
formater
.
data
=
reinterpret_cast
<
void
*>
(
printed_tensor
.
data
<
void
>
());
formater
.
data
=
reinterpret_cast
<
void
*>
(
printed_tensor
.
data
<
void
>
());
formater
(
printed_tensor
.
numel
());
formater
(
printed_tensor
.
numel
());
}
}
...
@@ -219,14 +202,14 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -219,14 +202,14 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
AddAttr
<
bool
>
(
"print_tensor_type"
,
"Whether to print the tensor's dtype."
);
AddAttr
<
bool
>
(
"print_tensor_type"
,
"Whether to print the tensor's dtype."
);
AddAttr
<
bool
>
(
"print_tensor_shape"
,
"Whether to print the tensor's shape."
);
AddAttr
<
bool
>
(
"print_tensor_shape"
,
"Whether to print the tensor's shape."
);
AddAttr
<
bool
>
(
"print_tensor_lod"
,
"Whether to print the tensor's lod."
);
AddAttr
<
bool
>
(
"print_tensor_lod"
,
"Whether to print the tensor's lod."
);
AddAttr
<
std
::
string
>
(
AddAttr
<
std
::
string
>
(
"print_phase"
,
"print_phase"
,
"(string, default 'FORWARD') Which phase to display "
"(string, default 'BOTH') Which phase to display
including 'FORWARD' "
"
including 'FORWARD' "
"'BACKWARD' and 'BOTH'."
)
"'BACKWARD' and 'BOTH'."
)
.
SetDefault
(
std
::
string
(
kBoth
))
.
SetDefault
(
std
::
string
(
kBoth
))
.
InEnum
({
std
::
string
(
kForward
),
std
::
string
(
kBackward
),
.
InEnum
({
std
::
string
(
kForward
),
std
::
string
(
kBackward
),
std
::
string
(
kBoth
)});
std
::
string
(
kBoth
)});
Add
Output
(
"Out"
,
"Output tensor with same data as input tensor."
);
Add
Attr
<
bool
>
(
"is_forward"
,
"Whether is forward or not"
).
SetDefault
(
true
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
Creates a print op that will print when a tensor is accessed.
Creates a print op that will print when a tensor is accessed.
...
@@ -238,40 +221,21 @@ tensor `t`.)DOC");
...
@@ -238,40 +221,21 @@ tensor `t`.)DOC");
class
InferShapeForward
:
public
framework
::
InferShapeBase
{
class
InferShapeForward
:
public
framework
::
InferShapeBase
{
public:
public:
void
operator
()(
framework
::
InferShapeContext
*
context
)
const
override
{
void
operator
()(
framework
::
InferShapeContext
*
context
)
const
override
{
PADDLE_ENFORCE
(
context
->
HasInput
(
"In"
),
"Input(In) should not be null."
);
PADDLE_ENFORCE
(
context
->
HasInput
(
"In"
),
"Input(In) should not be null."
);
context
->
ShareLoD
(
"In"
,
/*->*/
"Out"
);
context
->
SetOutputDim
(
"Out"
,
context
->
GetInputDim
(
"In"
));
}
};
class
InferShapeBackward
:
public
framework
::
InferShapeBase
{
public:
void
operator
()(
framework
::
InferShapeContext
*
context
)
const
override
{
PADDLE_ENFORCE
(
context
->
HasInput
(
"In@GRAD"
),
"Input(In@GRAD) should not be null."
);
context
->
ShareLoD
(
"In@GRAD"
,
/*->*/
"Out"
);
context
->
SetOutputDim
(
"Out"
,
context
->
GetInputDim
(
"In@GRAD"
));
}
}
};
};
class
InferVarType
:
public
framework
::
VarTypeInference
{
class
PrintOpGradientMaker
:
public
framework
::
SingleGradOpDescMaker
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{}
};
class
PrintOpProtoAndCheckGradOpMaker
:
public
framework
::
SingleGradOpDescMaker
{
public:
public:
using
framework
::
SingleGradOpDescMaker
::
SingleGradOpDescMaker
;
using
framework
::
SingleGradOpDescMaker
::
SingleGradOpDescMaker
;
std
::
unique_ptr
<
framework
::
OpDesc
>
Apply
()
const
override
{
std
::
unique_ptr
<
framework
::
OpDesc
>
Apply
()
const
override
{
auto
*
op_desc_ptr
=
new
framework
::
OpDesc
();
auto
*
op_desc_ptr
=
new
framework
::
OpDesc
();
op_desc_ptr
->
SetType
(
"print_grad"
);
op_desc_ptr
->
SetType
(
"print"
);
op_desc_ptr
->
SetInput
(
"In@GRAD"
,
OutputGrad
(
"Out"
));
op_desc_ptr
->
SetInput
(
"In"
,
InputGrad
(
"In"
));
op_desc_ptr
->
SetOutput
(
"Out"
,
InputGrad
(
"In"
));
op_desc_ptr
->
SetAttrMap
(
Attrs
());
op_desc_ptr
->
SetAttrMap
(
Attrs
());
op_desc_ptr
->
SetAttr
(
"is_forward"
,
false
);
return
std
::
unique_ptr
<
framework
::
OpDesc
>
(
op_desc_ptr
);
return
std
::
unique_ptr
<
framework
::
OpDesc
>
(
op_desc_ptr
);
}
}
};
};
...
@@ -282,6 +246,4 @@ class PrintOpProtoAndCheckGradOpMaker
...
@@ -282,6 +246,4 @@ class PrintOpProtoAndCheckGradOpMaker
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
print
,
ops
::
TensorPrintOp
,
ops
::
PrintOpProtoAndCheckMaker
,
REGISTER_OPERATOR
(
print
,
ops
::
TensorPrintOp
,
ops
::
PrintOpProtoAndCheckMaker
,
ops
::
PrintOpProtoAndCheckGradOpMaker
,
ops
::
InferShapeForward
,
ops
::
PrintOpGradientMaker
,
ops
::
InferShapeForward
);
ops
::
InferVarType
);
REGISTER_OPERATOR
(
print_grad
,
ops
::
TensorPrintOp
,
ops
::
InferShapeBackward
);
paddle/fluid/operators/scale_op.cc
浏览文件 @
b98b7440
...
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
...
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/operators/scale_op.h"
#include <string>
#include <string>
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -52,6 +55,21 @@ $$Out = scale*X$$
...
@@ -52,6 +55,21 @@ $$Out = scale*X$$
}
}
};
};
// Variable-type inference for the scale op: the first "Out" variable takes
// both the variable type and the data type of the first "X" variable.
class ScaleOpVarTypeInference : public framework::VarTypeInference {
 public:
  // op_desc: description of the scale op whose "X"/"Out" names are read.
  // block:   block used to look the two variables up (recursively).
  void operator()(const framework::OpDesc &op_desc,
                  framework::BlockDesc *block) const override {
    const auto &in_var_name = op_desc.Input("X").front();
    auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name));

    auto out_var_name = op_desc.Output("Out").front();
    // Guard the output lookup the same way as the input lookup so that a
    // missing variable is reported instead of dereferencing a null pointer.
    auto &out_var = detail::Ref(block->FindVarRecursive(out_var_name));

    out_var.SetType(in_var.GetType());
    out_var.SetDataType(in_var.GetDataType());
  }
};
class
ScaleGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
class
ScaleGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
public:
public:
using
framework
::
SingleGradOpDescMaker
::
SingleGradOpDescMaker
;
using
framework
::
SingleGradOpDescMaker
::
SingleGradOpDescMaker
;
...
@@ -71,7 +89,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
...
@@ -71,7 +89,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
scale
,
ops
::
ScaleOp
,
ops
::
ScaleOpMaker
,
ops
::
ScaleGradMaker
);
REGISTER_OPERATOR
(
scale
,
ops
::
ScaleOp
,
ops
::
ScaleOpMaker
,
ops
::
ScaleGradMaker
,
ops
::
ScaleOpVarTypeInference
);
REGISTER_OP_CPU_KERNEL
(
REGISTER_OP_CPU_KERNEL
(
scale
,
ops
::
ScaleKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
scale
,
ops
::
ScaleKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
ScaleKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
,
ops
::
ScaleKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
,
...
...
paddle/fluid/operators/scale_op.h
浏览文件 @
b98b7440
...
@@ -22,17 +22,29 @@ namespace operators {
...
@@ -22,17 +22,29 @@ namespace operators {
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
ScaleKernel
:
public
framework
::
OpKernel
<
T
>
{
class
ScaleKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
virtual
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
{
virtual
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
auto
*
tensor
=
context
.
Output
<
framework
::
Tensor
>
(
"Out"
);
auto
*
in_var
=
ctx
.
InputVar
(
"X"
);
auto
*
in
=
context
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
*
in
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
tensor
->
mutable_data
<
T
>
(
in
->
place
());
auto
scale
=
static_cast
<
T
>
(
context
.
Attr
<
float
>
(
"scale"
));
auto
*
out_var
=
ctx
.
OutputVar
(
"Out"
);
auto
*
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
out
->
mutable_data
<
T
>
(
in
->
place
());
auto
eigen_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
tensor
);
PADDLE_ENFORCE_EQ
(
in
->
dims
(),
out
->
dims
(),
"in and out should have the same dim"
);
auto
scale
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"scale"
));
if
(
in_var
->
IsType
<
framework
::
SelectedRows
>
()
&&
in_var
!=
out_var
)
{
auto
&
in_slr
=
in_var
->
Get
<
framework
::
SelectedRows
>
();
auto
*
out_slr
=
out_var
->
GetMutable
<
framework
::
SelectedRows
>
();
out_slr
->
set_rows
(
in_slr
.
rows
());
out_slr
->
set_height
(
in_slr
.
height
());
}
auto
eigen_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
out
);
auto
eigen_in
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
in
);
auto
eigen_in
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
in
);
auto
&
dev
=
auto
&
dev
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
eigen_out
.
device
(
dev
)
=
scale
*
eigen_in
;
eigen_out
.
device
(
dev
)
=
scale
*
eigen_in
;
}
}
};
};
...
...
paddle/fluid/operators/send_barrier_op.cc
浏览文件 @
b98b7440
...
@@ -56,6 +56,10 @@ class SendBarrierOp : public framework::OperatorBase {
...
@@ -56,6 +56,10 @@ class SendBarrierOp : public framework::OperatorBase {
class
SendBarrierOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
SendBarrierOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
public:
void
Make
()
{
void
Make
()
{
AddInput
(
"X"
,
"(Any) Dummy inputs, used for control dependency"
)
.
AsDuplicable
();
AddOutput
(
"Out"
,
"(Any) Dummy outputs, used for control dependency"
)
.
AsDuplicable
();
AddComment
(
R"DOC(
AddComment
(
R"DOC(
SendBarrier operator
SendBarrier operator
...
...
paddle/fluid/operators/sequence_pad_op.cc
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_pad_op.h"
namespace
paddle
{
namespace
operators
{
// Forward operator definition for sequence_pad. Only shape inference lives
// here; the computation is done by the registered kernels.
class SequencePadOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  // Infers the shape of "Out" as {seq_num, padded_length, <time-step dims>}.
  // At compile time the first two dimensions are left as -1; at run time they
  // are derived from the level-0 lod of "X" and Attr(padded_length).
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SequencePadOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("PadValue"),
                   "Input(PadValue) of SequencePadOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of SequencePadOp should not be null.");

    auto x_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE_GE(x_dims.size(), 2,
                      "The rank of Input(x) can't be less than 2.");
    // Everything after the first dimension of X describes one time step.
    auto time_step_dims = framework::slice_ddim(x_dims, 1, x_dims.size());
    auto pad_value_dims = ctx->GetInputDim("PadValue");
    PADDLE_ENFORCE(pad_value_dims == framework::make_ddim({1}) ||
                       pad_value_dims == time_step_dims,
                   "The Input(PadValue) must be a scalar or a tensor whose "
                   "shape equals to time steps in sequences");

    int out_dim_0 = -1;
    int out_dim_1 = -1;

    if (ctx->IsRuntime()) {
      // run time
      framework::Variable* x_var =
          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
      const auto& x_lod = x_var->Get<LoDTensor>().lod();
      PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info.");
      const auto& x_lod_0 = x_lod[0];
      PADDLE_ENFORCE_GE(x_lod_0.size(), 2,
                        "The Input(X)'s lod info is corrupted.");
      PADDLE_ENFORCE_EQ(
          x_dims[0], static_cast<int64_t>(x_lod_0.back()),
          "The Input(X)'s lod info mismatches the actual tensor shape.");

      int seq_num = x_lod_0.size() - 1;
      int max_seq_len = math::MaximumSequenceLength(x_lod_0);
      int padded_length = ctx->Attrs().Get<int>("padded_length");
      // -1 means "pad up to the longest sequence in the batch".
      if (padded_length == -1) {
        padded_length = max_seq_len;
      }
      // The check is >=, so a padded_length equal to the longest sequence is
      // valid; the message states exactly that.
      PADDLE_ENFORCE_GE(padded_length, max_seq_len,
                        "The Attr(padded_length) must be -1 or an int not "
                        "less than the length of the longest original "
                        "sequence.");
      out_dim_0 = seq_num;
      out_dim_1 = padded_length;
    } else {
      // compile time
      framework::VarDesc* x_desc =
          boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("X")[0]);
      PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1,
                        "The lod level of Input(X) of SequencePadOp must be "
                        "at least 1.");
    }

    std::vector<int> out_dims_vec{out_dim_0, out_dim_1};
    auto time_step_dims_vec = framework::vectorize2int(time_step_dims);
    out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(),
                        time_step_dims_vec.end());
    ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
  }
};
// Declares the inputs, output, attribute and user-facing documentation of
// the sequence_pad operator.
class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(LoDTensor, default LoDTensor<float>) Input variable which "
             "should contain lod information.");
    AddInput("PadValue",
             "(LoDTensor), this Tensor holds values that will be filled into "
             "padded steps. It can be a scalar or a tensor whose shape equals "
             "to time steps in sequences. If it's a scalar, it will be "
             "automatically broadcasted to the shape of time step.");
    AddOutput(
        "Out",
        "(LoDTensor) The output variable, which contains padded sequences.");
    // The shape inference of the op accepts padded_length == longest
    // sequence length, so the text says "not less than".
    AddAttr<int>(
        "padded_length",
        "The length of padded sequences. It can be set to -1 or "
        "any positive int. When it is -1, all sequences will be padded up to "
        "the length of the longest one among them; when it is a certain "
        "positive value, it must not be less than the length of the longest "
        "original sequence.")
        .SetDefault(-1);
    AddComment(R"DOC(
      Sequence Pad Operator

      This operator pads sequences in a same batch to a consistent length.
      The length is specified by attribute 'padded_length'. New elements,
      whose values are specified by input 'PadValue', will be appended to
      the end of each sequence, to make their final lengths consistent.

      Following are cases to better explain how this works:

      Case 1:

      Given a 1-level LoDTensor input(X):
          X.lod = [[0, 2, 5]]
          X.data = [a, b, c, d, e]
      and Input(PadValue):
          PadValue.data = [0]
      and attribute 'padded_length' = 4,
      then we get LoDTensor:
          Out.data = [[a, b, 0, 0],
                      [c, d, e, 0]]

      Case 2:

      Given a 1-level LoDTensor input(X):
          X.lod = [[0, 2, 5]]
          X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
      and Input(PadValue):
          PadValue.data = [0]
      and attribute 'padded_length' = -1, which means using the length
      of longest input sequence(3 in this case),
      then we get LoDTensor:
          Out.data = [[[a1, a2], [b1, b2], [0, 0]],
                      [[c1, c2], [d1, d2], [e1, e2]]]

      Case 3:

      Given a 1-level LoDTensor input(X):
          X.lod = [[0, 2, 5]]
          X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
      and Input(PadValue):
          PadValue.data = [p1, p2]
      and attribute 'padded_length' = -1, which means using the length
      of longest input sequence(3 in this case),
      then we get LoDTensor:
          Out.data = [[[a1, a2], [b1, b2], [p1, p2]],
                      [[c1, c2], [d1, d2], [e1, e2]]]

    )DOC");
  }
};
// Backward operator definition for sequence_pad: when the gradient of X is
// requested, it gets the shape and lod of X.
class SequencePadGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SequencePadGradOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) of SequencePadGradOp should not be null.");

    const auto x_grad_name = framework::GradVarName("X");
    // Nothing to infer when the gradient of X is not an output of this op.
    if (!ctx->HasOutput(x_grad_name)) {
      return;
    }
    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
    ctx->ShareLoD("X", /*->*/ x_grad_name);
  }
};
}
// namespace operators
}
// namespace paddle
namespace ops = paddle::operators;

// Operator registration: the forward op with its maker and the default
// grad-op-desc maker, followed by the backward op.
REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp);

// CPU kernels of the forward op for float, double, int and int64_t.
REGISTER_OP_CPU_KERNEL(
    sequence_pad,
    ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, double>,
    ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int>,
    ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);

// CPU kernels of the backward op for the same element types.
REGISTER_OP_CPU_KERNEL(
    sequence_pad_grad,
    ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, double>,
    ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int>,
    ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
paddle/fluid/operators/sequence_pad_op.cu
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_pad_op.h"
namespace ops = paddle::operators;

// CUDA kernels of the forward op for float, double, int and int64_t.
REGISTER_OP_CUDA_KERNEL(
    sequence_pad,
    ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, int>,
    ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);

// CUDA kernels of the backward op for the same element types.
REGISTER_OP_CUDA_KERNEL(
    sequence_pad_grad,
    ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, int>,
    ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext,
                                 int64_t>);
paddle/fluid/operators/sequence_pad_op.h
0 → 100644
浏览文件 @
b98b7440
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h"
namespace
paddle
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
using
LoD
=
framework
::
LoD
;
// Forward kernel of sequence_pad: hands "X", "PadValue" and
// Attr(padded_length) to math::PaddingLoDTensorFunctor, which writes "Out".
template <typename DeviceContext, typename T>
class SequencePadOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const LoDTensor* input = ctx.Input<LoDTensor>("X");

    // Allocate the output buffer on the place this kernel runs on.
    LoDTensor* padded = ctx.Output<LoDTensor>("Out");
    padded->mutable_data<T>(ctx.GetPlace());

    const LoDTensor* fill_value = ctx.Input<LoDTensor>("PadValue");
    const int target_length = ctx.Attr<int>("padded_length");

    math::PaddingLoDTensorFunctor<DeviceContext, T> pad_sequences;
    pad_sequences(ctx.template device_context<DeviceContext>(), *input, padded,
                  *fill_value, target_length, 0, false,
                  math::kBatchLengthWidth);
  }
};
// Backward kernel of sequence_pad: hands Out@GRAD and Attr(padded_length) to
// math::UnpaddingLoDTensorFunctor, which writes X@GRAD.
template <typename DeviceContext, typename T>
class SequencePadGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    LoDTensor* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
    // Nothing to compute when the gradient of X is not available.
    if (x_grad == nullptr) {
      return;
    }

    const LoDTensor* out_grad =
        ctx.Input<LoDTensor>(framework::GradVarName("Out"));
    x_grad->mutable_data<T>(ctx.GetPlace());

    const int target_length = ctx.Attr<int>("padded_length");

    math::UnpaddingLoDTensorFunctor<DeviceContext, T> unpad_sequences;
    unpad_sequences(ctx.template device_context<DeviceContext>(), *out_grad,
                    x_grad, target_length, 0, false, math::kBatchLengthWidth);
  }
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/unstack_op.cc
0 → 100644
浏览文件 @
b98b7440
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/unstack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;

// The unstack operators below create "stack" / "stack_grad" operators by
// name at run time, so the stack op is referenced here.
USE_OP(stack);

// Forward op: operator, proto maker, shape inference and grad-op-desc maker.
REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker,
                  ops::UnStackOpInferShape, ops::UnStackGradOpDescMaker);

// Backward op: operator and its shape inference.
REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp,
                  ops::UnStackOpGradInferShape);
paddle/fluid/operators/unstack_op.h
0 → 100644
浏览文件 @
b98b7440
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
// Shape inference for the unstack op: every output in "Y" gets the shape of
// "X" with dimension Attr(axis) removed.
class UnStackOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist.");

    int axis = ctx->Attrs().Get<int>("axis");
    int num = ctx->Attrs().Get<int>("num");
    auto x_dim = ctx->GetInputDim("X");
    int rank = x_dim.size();
    PADDLE_ENFORCE(axis >= -rank && axis < rank,
                   "Attr(axis) must be inside [-rank, rank), where rank = %d",
                   rank);
    // Normalize a negative axis to its non-negative equivalent.
    if (axis < 0) axis += rank;

    PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast<size_t>(num),
                      "Number of Outputs(Y) is wrong");
    // The extent along `axis` is only compared with `num` when it is
    // positive; a non-positive extent is tolerated here.
    if (x_dim[axis] > 0) {
      PADDLE_ENFORCE_EQ(num, x_dim[axis], "Number of Outputs(Y) is wrong");
    }

    auto vec = framework::vectorize2int(x_dim);
    vec.erase(vec.begin() + axis);
    // Size the list of output dims by `num` (already enforced to equal the
    // number of outputs) rather than by x_dim[axis]: that extent may be
    // non-positive, and a negative count converted to size_t would request
    // an enormous vector.
    ctx->SetOutputsDim(
        "Y", std::vector<framework::DDim>(  // NOLINT
                 static_cast<size_t>(num), framework::make_ddim(vec)));
  }
};
// Declares the input, the duplicable output, the two attributes and the
// user-facing documentation of the unstack operator.
class UnStackOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    // Tensor to be split.
    AddInput("X", "The input of unstack op.");
    // One output variable per slice.
    AddOutput("Y", "The output of unstack op.").AsDuplicable();

    // `axis` defaults to 0; `num` has no default and must be positive.
    AddAttr<int>("axis", "The axis along which Input(X) should be unstacked.")
        .SetDefault(0);
    AddAttr<int>("num", "The number of outputs(Y).").GreaterThan(0);

    AddComment(R"DOC(
      UnStack Operator.

      UnStack Input(X) into several tensors along Attr(axis).
    )DOC");
  }
};
// Forward unstack operator. It has no kernel of its own: it builds a
// "stack_grad" operator on the fly and runs it.
class UnStackOp : public framework::OperatorBase {
 public:
  using OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
    // This op's X is fed in as stack_grad's Y@GRAD, and this op's Y list
    // receives stack_grad's X@GRAD.
    const framework::VariableNameMap delegate_inputs = {
        {framework::GradVarName("Y"), {Input("X")}}};
    const framework::VariableNameMap delegate_outputs = {
        {framework::GradVarName("X"), Outputs("Y")}};

    auto delegate = framework::OpRegistry::CreateOp(
        "stack_grad", delegate_inputs, delegate_outputs, Attrs());
    delegate->Run(scope, place);
  }
};
// Shape inference for unstack_grad: X@GRAD gets the common shape of the
// Y@GRAD inputs with one extra dimension, of size equal to the number of
// inputs, inserted at Attr(axis).
class UnStackOpGradInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0,
                      "Number of Inputs(Y@Grad) must be larger than 0");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@Grad) must exist.");

    // Every gradient slice must have the same shape as the first one.
    auto slice_dims = ctx->GetInputsDim(framework::GradVarName("Y"));
    for (size_t idx = 1; idx < slice_dims.size(); ++idx) {
      PADDLE_ENFORCE_EQ(slice_dims[idx], slice_dims[0],
                        "Dims of all Inputs(Y@Grad) must be the same");
    }

    int axis = ctx->Attrs().Get<int>("axis");
    int rank = slice_dims[0].size();
    // The output has one more dimension than each slice.
    const int out_rank = rank + 1;
    PADDLE_ENFORCE(
        axis >= -out_rank && axis < out_rank,
        "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank);
    axis = (axis < 0) ? axis + out_rank : axis;

    auto out_shape = framework::vectorize2int(slice_dims[0]);
    out_shape.insert(out_shape.begin() + axis, slice_dims.size());
    ctx->SetOutputDim(framework::GradVarName("X"),
                      framework::make_ddim(out_shape));
  }
};
// Builds the description of the backward op of unstack: an "unstack_grad"
// op that reads Y@GRAD and writes X@GRAD with the forward op's attributes.
class UnStackGradOpDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("unstack_grad");

    // Wire the gradient variables of the forward output and input.
    grad_desc->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
    grad_desc->SetOutput(framework::GradVarName("X"), InputGrad("X"));

    // The backward op reuses the forward op's attribute map.
    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
// Backward unstack operator. It has no kernel of its own: it builds a
// "stack" operator on the fly and runs it.
class UnStackGradOp : public framework::OperatorBase {
 public:
  using OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
    // The Y@GRAD list is fed in as stack's X, and this op's X@GRAD receives
    // stack's Y.
    const framework::VariableNameMap delegate_inputs = {
        {"X", Inputs(framework::GradVarName("Y"))}};
    const framework::VariableNameMap delegate_outputs = {
        {"Y", {Output(framework::GradVarName("X"))}}};

    auto delegate = framework::OpRegistry::CreateOp(
        "stack", delegate_inputs, delegate_outputs, Attrs());
    delegate->Run(scope, place);
  }
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/warpctc_op.h
浏览文件 @
b98b7440
...
@@ -153,17 +153,29 @@ class WarpCTCKernel : public framework::OpKernel<T> {
...
@@ -153,17 +153,29 @@ class WarpCTCKernel : public framework::OpKernel<T> {
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
num_sequences
),
1
});
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
num_sequences
),
1
});
// warpctc needs sequences data stored in transposed padding format
// warpctc needs sequences data stored in transposed padding format
Tensor
warpctc_logits
;
LoD
Tensor
warpctc_logits
;
const
size_t
max_sequence_length
=
const
size_t
max_sequence_length
=
math
::
MaximumSequenceLength
(
logits_lod
,
level
);
math
::
MaximumSequenceLength
(
logits_lod
[
level
]
);
auto
warpctc_logits_dims
=
auto
warpctc_logits_dims
=
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
max_sequence_length
),
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
max_sequence_length
),
static_cast
<
int64_t
>
(
num_sequences
),
static_cast
<
int64_t
>
(
num_sequences
),
static_cast
<
int64_t
>
(
sequence_width
)});
static_cast
<
int64_t
>
(
sequence_width
)});
warpctc_logits
.
mutable_data
<
T
>
(
warpctc_logits_dims
,
ctx
.
GetPlace
());
warpctc_logits
.
mutable_data
<
T
>
(
warpctc_logits_dims
,
ctx
.
GetPlace
());
LoDTensor
cpu_pad_value
;
T
*
pad_value_data
=
cpu_pad_value
.
mutable_data
<
T
>
({
1
},
platform
::
CPUPlace
());
*
pad_value_data
=
static_cast
<
T
>
(
0
);
LoDTensor
pad_value
;
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()))
{
pad_value
=
cpu_pad_value
;
}
else
{
TensorCopySync
(
cpu_pad_value
,
ctx
.
GetPlace
(),
&
pad_value
);
}
math
::
PaddingLoDTensorFunctor
<
DeviceContext
,
T
>
()(
math
::
PaddingLoDTensorFunctor
<
DeviceContext
,
T
>
()(
ctx
.
template
device_context
<
DeviceContext
>(),
*
logits
,
&
warpctc_logits
,
ctx
.
template
device_context
<
DeviceContext
>(),
*
logits
,
&
warpctc_logits
,
false
);
pad_value
,
-
1
,
0
,
false
/* norm_by_times */
,
math
::
kLengthBatchWidth
);
const
T
*
warpctc_logits_data
=
warpctc_logits
.
data
<
T
>
();
const
T
*
warpctc_logits_data
=
warpctc_logits
.
data
<
T
>
();
std
::
vector
<
int
>
warpctc_label_lengths
(
num_sequences
);
std
::
vector
<
int
>
warpctc_label_lengths
(
num_sequences
);
...
@@ -209,15 +221,15 @@ template <typename DeviceContext, typename T>
...
@@ -209,15 +221,15 @@ template <typename DeviceContext, typename T>
class
WarpCTCGradKernel
:
public
framework
::
OpKernel
<
T
>
{
class
WarpCTCGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
warpctc_grad
=
ctx
.
Input
<
Tensor
>
(
"WarpCTCGrad"
);
auto
*
warpctc_grad
=
ctx
.
Input
<
LoD
Tensor
>
(
"WarpCTCGrad"
);
auto
*
logits_grad
=
ctx
.
Output
<
LoDTensor
>
(
framework
::
GradVarName
(
"Logits"
));
auto
*
logits_grad
=
ctx
.
Output
<
LoDTensor
>
(
framework
::
GradVarName
(
"Logits"
));
const
Tensor
*
loss_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Loss"
));
const
Tensor
*
loss_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Loss"
));
logits_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
logits_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
bool
norm_by_times
=
ctx
.
Attr
<
bool
>
(
"norm_by_times"
);
bool
norm_by_times
=
ctx
.
Attr
<
bool
>
(
"norm_by_times"
);
math
::
UnpaddingLoDTensorFunctor
<
DeviceContext
,
T
>
()(
math
::
UnpaddingLoDTensorFunctor
<
DeviceContext
,
T
>
()(
ctx
.
template
device_context
<
DeviceContext
>(),
logits
_grad
,
ctx
.
template
device_context
<
DeviceContext
>(),
*
warpctc
_grad
,
*
warpctc_grad
,
norm_by_times
);
logits_grad
,
-
1
,
0
,
norm_by_times
,
math
::
kLengthBatchWidth
);
const
T
*
loss_grad_data
=
loss_grad
->
data
<
T
>
();
const
T
*
loss_grad_data
=
loss_grad
->
data
<
T
>
();
math
::
ScaleLoDTensorFunctor
<
DeviceContext
,
T
>
()(
math
::
ScaleLoDTensorFunctor
<
DeviceContext
,
T
>
()(
...
...
paddle/fluid/platform/CMakeLists.txt
浏览文件 @
b98b7440
if
(
NOT WIN32
)
proto_library
(
profiler_proto SRCS profiler.proto DEPS framework_proto
)
proto_library
(
profiler_proto SRCS profiler.proto DEPS framework_proto
)
py_proto_compile
(
profiler_py_proto SRCS profiler.proto
)
py_proto_compile
(
profiler_py_proto SRCS profiler.proto
)
...
@@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD
...
@@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND cp *.py
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/proto/profiler
COMMAND cp *.py
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/proto/profiler
COMMENT
"Copy generated python proto into directory paddle/fluid/proto/profiler."
COMMENT
"Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
)
WORKING_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
)
endif
(
NOT WIN32
)
if
(
WITH_GPU
)
if
(
WITH_GPU
)
nv_library
(
enforce SRCS enforce.cc
)
nv_library
(
enforce SRCS enforce.cc
)
...
@@ -58,9 +60,12 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
...
@@ -58,9 +60,12 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test
(
cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda
)
nv_test
(
cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda
)
nv_test
(
transform_test SRCS transform_test.cu DEPS memory place device_context
)
nv_test
(
transform_test SRCS transform_test.cu DEPS memory place device_context
)
if
(
NOT WIN32
)
cc_library
(
device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto
${
GPU_CTX_DEPS
}
)
cc_library
(
device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto
${
GPU_CTX_DEPS
}
)
cc_library
(
profiler SRCS profiler.cc DEPS device_context device_tracer
)
cc_library
(
profiler SRCS profiler.cc DEPS device_context device_tracer
)
cc_test
(
profiler_test SRCS profiler_test.cc DEPS profiler
)
cc_test
(
profiler_test SRCS profiler_test.cc DEPS profiler
)
endif
(
NOT WIN32
)
nv_test
(
float16_gpu_test SRCS float16_test.cu DEPS lod_tensor
)
nv_test
(
float16_gpu_test SRCS float16_test.cu DEPS lod_tensor
)
cc_test
(
float16_test SRCS float16_test.cc DEPS lod_tensor
)
cc_test
(
float16_test SRCS float16_test.cc DEPS lod_tensor
)
...
...
paddle/fluid/platform/cpu_info.cc
浏览文件 @
b98b7440
...
@@ -22,9 +22,13 @@ limitations under the License. */
...
@@ -22,9 +22,13 @@ limitations under the License. */
#ifdef __APPLE__
#ifdef __APPLE__
#include <sys/sysctl.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#include <sys/types.h>
#elif defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h>
#else
#else
#include <unistd.h>
#include <unistd.h>
#endif
#endif
// _WIN32
#include <algorithm>
#include <algorithm>
#include "gflags/gflags.h"
#include "gflags/gflags.h"
...
@@ -32,16 +36,20 @@ limitations under the License. */
...
@@ -32,16 +36,20 @@ limitations under the License. */
DEFINE_double
(
fraction_of_cpu_memory_to_use
,
1
,
DEFINE_double
(
fraction_of_cpu_memory_to_use
,
1
,
"Default use 100% of CPU memory for PaddlePaddle,"
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc"
);
"reserve the rest for page tables, etc"
);
#if !defined(_WIN32)
DEFINE_uint64
(
initial_cpu_memory_in_mb
,
DEFINE_uint64
(
initial_cpu_memory_in_mb
,
#ifdef PADDLE_WITH_MKLDNN
#ifdef PADDLE_WITH_MKLDNN
/* Aligned with mozga-intel, MKLDNN need at least 5000 MB
/* Aligned with mozga-intel, MKLDNN need at least 5000 MB
* to obtain the best performance*/
* to obtain the best performance*/
5000
,
5000
ul
,
#else
#else
500
,
500
ul
,
#endif
#endif
"Initial CPU memory for PaddlePaddle, in MD unit."
);
"Initial CPU memory for PaddlePaddle, in MD unit."
);
#else
DEFINE_uint64
(
initial_cpu_memory_in_mb
,
500ul
,
"Initial CPU memory for PaddlePaddle, in MD unit."
);
#endif // !defined(_WIN32)
DEFINE_double
(
DEFINE_double
(
fraction_of_cuda_pinned_memory_to_use
,
0.5
,
fraction_of_cuda_pinned_memory_to_use
,
0.5
,
...
@@ -60,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() {
...
@@ -60,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() {
size_t
len
=
sizeof
(
size
);
size_t
len
=
sizeof
(
size
);
if
(
sysctl
(
mib
,
2
,
&
size
,
&
len
,
NULL
,
0
)
==
0
)
return
(
size_t
)
size
;
if
(
sysctl
(
mib
,
2
,
&
size
,
&
len
,
NULL
,
0
)
==
0
)
return
(
size_t
)
size
;
return
0L
;
return
0L
;
#elif defined(_WIN32)
MEMORYSTATUSEX
sMeminfo
;
sMeminfo
.
dwLength
=
sizeof
(
sMeminfo
);
GlobalMemoryStatusEx
(
&
sMeminfo
);
return
sMeminfo
.
ullTotalPhys
;
#else
#else
int64_t
pages
=
sysconf
(
_SC_PHYS_PAGES
);
int64_t
pages
=
sysconf
(
_SC_PHYS_PAGES
);
int64_t
page_size
=
sysconf
(
_SC_PAGE_SIZE
);
int64_t
page_size
=
sysconf
(
_SC_PAGE_SIZE
);
...
...
paddle/fluid/platform/device_tracer.h
浏览文件 @
b98b7440
...
@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
#if !defined(_WIN32)
#include <sys/time.h>
#include <sys/time.h>
#else
#include <windows.h>
#endif // !_WIN32
#include <time.h>
#include <time.h>
#include <chrono> // NOLINT
#include <chrono> // NOLINT
#include <string>
#include <string>
...
@@ -27,12 +32,15 @@ namespace platform {
...
@@ -27,12 +32,15 @@ namespace platform {
///////////////////////
///////////////////////
// WARN: Under Development. Don't depend on it yet.
// WARN: Under Development. Don't depend on it yet.
//////////////////////
//////////////////////
#if !defined(_WIN32)
inline
uint64_t
PosixInNsec
()
{
inline
uint64_t
PosixInNsec
()
{
struct
timeval
tv
;
struct
timeval
tv
;
gettimeofday
(
&
tv
,
nullptr
);
gettimeofday
(
&
tv
,
nullptr
);
return
1000
*
(
static_cast
<
uint64_t
>
(
tv
.
tv_sec
)
*
1000000
+
tv
.
tv_usec
);
return
1000
*
(
static_cast
<
uint64_t
>
(
tv
.
tv_sec
)
*
1000000
+
tv
.
tv_usec
);
}
}
#else
inline
uint64_t
PosixInNsec
()
{
return
static_cast
<
uint64_t
>
(
0
);
}
#endif // !_WIN32
// DeviceTracer performs the following tasks:
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
...
...
paddle/fluid/platform/dynload/CMakeLists.txt
浏览文件 @
b98b7440
...
@@ -16,7 +16,9 @@ if (CUPTI_FOUND)
...
@@ -16,7 +16,9 @@ if (CUPTI_FOUND)
list
(
APPEND CUDA_SRCS cupti.cc
)
list
(
APPEND CUDA_SRCS cupti.cc
)
endif
(
CUPTI_FOUND
)
endif
(
CUPTI_FOUND
)
nv_library
(
dynload_cuda SRCS
${
CUDA_SRCS
}
DEPS dynamic_loader
)
nv_library
(
dynload_cuda SRCS
${
CUDA_SRCS
}
DEPS dynamic_loader
)
if
(
NOT WIN32
)
cc_library
(
dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc
)
cc_library
(
dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc
)
endif
(
NOT WIN32
)
if
(
WITH_MKLML
)
if
(
WITH_MKLML
)
cc_library
(
dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml
)
cc_library
(
dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml
)
endif
()
endif
()
...
...
paddle/fluid/platform/dynload/dynamic_loader.cc
浏览文件 @
b98b7440
...
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
...
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include <dlfcn.h>
#include <memory>
#include <memory>
#include <mutex> // NOLINT
#include <mutex> // NOLINT
#include <string>
#include <string>
...
@@ -23,6 +21,7 @@ limitations under the License. */
...
@@ -23,6 +21,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
DEFINE_string
(
cudnn_dir
,
""
,
DEFINE_string
(
cudnn_dir
,
""
,
"Specify path for loading libcudnn.so. For instance, "
"Specify path for loading libcudnn.so. For instance, "
...
...
paddle/fluid/platform/enforce.h
浏览文件 @
b98b7440
...
@@ -18,6 +18,11 @@ limitations under the License. */
...
@@ -18,6 +18,11 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__
#endif // __GNUC__
#if defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#endif
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cublas_v2.h>
#include <cudnn.h>
#include <cudnn.h>
...
@@ -117,7 +122,12 @@ struct EOFException : public std::exception {
...
@@ -117,7 +122,12 @@ struct EOFException : public std::exception {
// always forces branch prediction of true.
// always forces branch prediction of true.
// This generates faster binary code. __builtin_expect is since C++11.
// This generates faster binary code. __builtin_expect is since C++11.
// For more details, please check https://stackoverflow.com/a/43870188/724872.
// For more details, please check https://stackoverflow.com/a/43870188/724872.
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// there is no equivalent intrinsics in msvc.
#define UNLIKELY(condition) (condition == 0)
#endif
template
<
typename
...
Args
>
template
<
typename
...
Args
>
inline
typename
std
::
enable_if
<
sizeof
...(
Args
)
!=
0
,
void
>::
type
throw_on_error
(
inline
typename
std
::
enable_if
<
sizeof
...(
Args
)
!=
0
,
void
>::
type
throw_on_error
(
...
@@ -230,6 +240,7 @@ inline void throw_on_error(T e) {
...
@@ -230,6 +240,7 @@ inline void throw_on_error(T e) {
throw_on_error
(
e
,
""
);
throw_on_error
(
e
,
""
);
}
}
#if !defined(_WIN32)
#define PADDLE_THROW(...) \
#define PADDLE_THROW(...) \
do { \
do { \
throw ::paddle::platform::EnforceNotMet( \
throw ::paddle::platform::EnforceNotMet( \
...
@@ -248,15 +259,28 @@ inline void throw_on_error(T e) {
...
@@ -248,15 +259,28 @@ inline void throw_on_error(T e) {
__FILE__, __LINE__); \
__FILE__, __LINE__); \
} \
} \
} while (false)
} while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif
#define PADDLE_THROW_EOF() \
#define PADDLE_THROW_EOF() \
do { \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \
__LINE__); \
} while (false)
} while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__)
#endif // REPLACE_ENFORCE_GLOG
#else // !_WIN32
// disable enforce, caused by the varardic macro exception error
#define PADDLE_THROW(x) \
do { \
throw std::make_exception_ptr( \
std::runtime_error("Windows disable the enforce.")); \
} while (false)
#define PADDLE_ENFORCE(x, ...) x
#endif // !_WIN32
/*
/*
* Some enforce helpers here, usage:
* Some enforce helpers here, usage:
* int a = 1;
* int a = 1;
...
...
paddle/fluid/platform/init.cc
浏览文件 @
b98b7440
...
@@ -85,9 +85,6 @@ void InitDevices(bool init_p2p) {
...
@@ -85,9 +85,6 @@ void InitDevices(bool init_p2p) {
}
catch
(
const
std
::
exception
&
exp
)
{
}
catch
(
const
std
::
exception
&
exp
)
{
LOG
(
WARNING
)
<<
"Compiled with WITH_GPU, but no GPU found in runtime."
;
LOG
(
WARNING
)
<<
"Compiled with WITH_GPU, but no GPU found in runtime."
;
}
}
#else
LOG
(
WARNING
)
<<
"'CUDA' is not supported, Please re-compile with WITH_GPU option"
;
#endif
#endif
InitDevices
(
init_p2p
,
devices
);
InitDevices
(
init_p2p
,
devices
);
}
}
...
@@ -101,9 +98,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
...
@@ -101,9 +98,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
}
catch
(
const
std
::
exception
&
exp
)
{
}
catch
(
const
std
::
exception
&
exp
)
{
LOG
(
WARNING
)
<<
"Compiled with WITH_GPU, but no GPU found in runtime."
;
LOG
(
WARNING
)
<<
"Compiled with WITH_GPU, but no GPU found in runtime."
;
}
}
#else
LOG
(
WARNING
)
<<
"'CUDA' is not supported, Please re-compile with WITH_GPU option"
;
#endif
#endif
for
(
size_t
i
=
0
;
i
<
devices
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
devices
.
size
();
++
i
)
{
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
b98b7440
...
@@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
...
@@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void
PopEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
void
PopEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
#if !defined(_WIN32)
struct
RecordEvent
{
struct
RecordEvent
{
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
...
@@ -94,6 +95,15 @@ struct RecordBlock {
...
@@ -94,6 +95,15 @@ struct RecordBlock {
std
::
string
name_
;
std
::
string
name_
;
uint64_t
start_ns_
;
uint64_t
start_ns_
;
};
};
#else
// windows do not support profiler temporarily.
struct
RecordEvent
{
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
)
{}
};
struct
RecordBlock
{
explicit
RecordBlock
(
int
block_id
)
{}
};
#endif
// Return the event list of all threads. Assumed the returned value calls
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
...
...
paddle/fluid/pybind/protobuf.cc
浏览文件 @
b98b7440
...
@@ -234,6 +234,7 @@ void BindVarDsec(pybind11::module *m) {
...
@@ -234,6 +234,7 @@ void BindVarDsec(pybind11::module *m) {
pybind11
::
enum_
<
pd
::
proto
::
VarType
::
Type
>
(
var_desc
,
"VarType"
,
""
)
pybind11
::
enum_
<
pd
::
proto
::
VarType
::
Type
>
(
var_desc
,
"VarType"
,
""
)
.
value
(
"BOOL"
,
pd
::
proto
::
VarType
::
BOOL
)
.
value
(
"BOOL"
,
pd
::
proto
::
VarType
::
BOOL
)
.
value
(
"UINT8"
,
pd
::
proto
::
VarType
::
UINT8
)
.
value
(
"UINT8"
,
pd
::
proto
::
VarType
::
UINT8
)
.
value
(
"INT8"
,
pd
::
proto
::
VarType
::
INT8
)
.
value
(
"INT16"
,
pd
::
proto
::
VarType
::
INT16
)
.
value
(
"INT16"
,
pd
::
proto
::
VarType
::
INT16
)
.
value
(
"INT32"
,
pd
::
proto
::
VarType
::
INT32
)
.
value
(
"INT32"
,
pd
::
proto
::
VarType
::
INT32
)
.
value
(
"INT64"
,
pd
::
proto
::
VarType
::
INT64
)
.
value
(
"INT64"
,
pd
::
proto
::
VarType
::
INT64
)
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
b98b7440
...
@@ -130,6 +130,7 @@ PYBIND11_PLUGIN(core) {
...
@@ -130,6 +130,7 @@ PYBIND11_PLUGIN(core) {
.
def
(
"set"
,
PyCPUTensorSetFromArray
<
bool
>
)
.
def
(
"set"
,
PyCPUTensorSetFromArray
<
bool
>
)
.
def
(
"set"
,
PyCPUTensorSetFromArray
<
uint16_t
>
)
.
def
(
"set"
,
PyCPUTensorSetFromArray
<
uint16_t
>
)
.
def
(
"set"
,
PyCPUTensorSetFromArray
<
uint8_t
>
)
.
def
(
"set"
,
PyCPUTensorSetFromArray
<
uint8_t
>
)
.
def
(
"set"
,
PyCPUTensorSetFromArray
<
int8_t
>
)
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
float
>
)
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
float
>
)
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
int
>
)
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
int
>
)
...
@@ -138,6 +139,7 @@ PYBIND11_PLUGIN(core) {
...
@@ -138,6 +139,7 @@ PYBIND11_PLUGIN(core) {
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
bool
>
)
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
bool
>
)
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
uint16_t
>
)
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
uint16_t
>
)
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
uint8_t
>
)
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
uint8_t
>
)
.
def
(
"set"
,
PyCUDATensorSetFromArray
<
int8_t
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
float
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
float
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
int
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
int
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
double
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
double
>
)
...
@@ -145,6 +147,7 @@ PYBIND11_PLUGIN(core) {
...
@@ -145,6 +147,7 @@ PYBIND11_PLUGIN(core) {
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
bool
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
bool
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
uint16_t
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
uint16_t
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
uint8_t
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
uint8_t
>
)
.
def
(
"set"
,
PyCUDAPinnedTensorSetFromArray
<
int8_t
>
)
#endif
#endif
.
def
(
"shape"
,
[](
Tensor
&
self
)
{
return
vectorize
(
self
.
dims
());
})
.
def
(
"shape"
,
[](
Tensor
&
self
)
{
return
vectorize
(
self
.
dims
());
})
.
def
(
"_set_float_element"
,
TensorSetElement
<
float
>
)
.
def
(
"_set_float_element"
,
TensorSetElement
<
float
>
)
...
...
paddle/fluid/pybind/tensor_py.h
浏览文件 @
b98b7440
...
@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
...
@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
inline
pybind11
::
buffer_info
CastToPyBuffer
(
const
framework
::
Tensor
&
tensor
)
{
inline
pybind11
::
buffer_info
CastToPyBuffer
(
const
framework
::
Tensor
&
tensor
)
{
auto
buffer_info
=
auto
buffer_info
=
details
::
CastToPyBufferImpl
<
true
,
0
,
float
,
int
,
double
,
int64_t
,
bool
,
details
::
CastToPyBufferImpl
<
true
,
0
,
float
,
int
,
double
,
int64_t
,
bool
,
uint8_t
,
platform
::
float16
>
()(
tensor
);
uint8_t
,
int8_t
,
platform
::
float16
>
()(
tensor
);
return
buffer_info
;
return
buffer_info
;
}
}
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
b98b7440
...
@@ -335,12 +335,18 @@ function assert_api_not_changed() {
...
@@ -335,12 +335,18 @@ function assert_api_not_changed() {
fi
fi
python
${
PADDLE_ROOT
}
/tools/diff_api.py
${
PADDLE_ROOT
}
/paddle/fluid/API.spec new.spec
python
${
PADDLE_ROOT
}
/tools/diff_api.py
${
PADDLE_ROOT
}
/paddle/fluid/API.spec new.spec
deactivate
deactivate
}
function
assert_api_spec_approvals
()
{
if
[
-z
${
BRANCH
}
]
;
then
BRANCH
=
"develop"
fi
API_CHANGE
=
`
git diff
--name-only
upstream/
develop
|
grep
"paddle/fluid/API.spec"
||
true
`
API_CHANGE
=
`
git diff
--name-only
upstream/
$BRANCH
|
grep
"paddle/fluid/API.spec"
||
true
`
echo
"checking API.spec change, PR:
${
GIT_PR_ID
}
, changes:
${
API_CHANGE
}
"
echo
"checking API.spec change, PR:
${
GIT_PR_ID
}
, changes:
${
API_CHANGE
}
"
if
[
${
API_CHANGE
}
]
&&
[
"
${
GIT_PR_ID
}
"
!=
""
]
;
then
if
[
${
API_CHANGE
}
]
&&
[
"
${
GIT_PR_ID
}
"
!=
""
]
;
then
#
TODO: curl -H 'Authorization: token ${TOKEN}'
#
NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
APPROVALS
=
`
curl
-H
"Authorization: token
${
GITHUB_API_TOKEN
}
"
https://api.github.com/repos/PaddlePaddle/Paddle/pulls/
${
GIT_PR_ID
}
/reviews |
\
APPROVALS
=
`
curl
-H
"Authorization: token
${
GITHUB_API_TOKEN
}
"
https://api.github.com/repos/PaddlePaddle/Paddle/pulls/
${
GIT_PR_ID
}
/reviews
?per_page
=
10000
|
\
python
${
PADDLE_ROOT
}
/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433
`
python
${
PADDLE_ROOT
}
/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433
`
echo
"current pr
${
GIT_PR_ID
}
got approvals:
${
APPROVALS
}
"
echo
"current pr
${
GIT_PR_ID
}
got approvals:
${
APPROVALS
}
"
if
[
"
${
APPROVALS
}
"
==
"FALSE"
]
;
then
if
[
"
${
APPROVALS
}
"
==
"FALSE"
]
;
then
...
@@ -622,11 +628,12 @@ function main() {
...
@@ -622,11 +628,12 @@ function main() {
cicheck
)
cicheck
)
cmake_gen
${
PYTHON_ABI
:-
""
}
cmake_gen
${
PYTHON_ABI
:-
""
}
build
build
assert_api_not_changed
${
PYTHON_ABI
:-
""
}
run_test
run_test
gen_capi_package
gen_capi_package
gen_fluid_inference_lib
gen_fluid_inference_lib
test_fluid_inference_lib
test_fluid_inference_lib
assert_api_
not_changed
${
PYTHON_ABI
:-
""
}
assert_api_
spec_approvals
;;
;;
*
)
*
)
print_usage
print_usage
...
...
python/paddle/dataset/image.py
浏览文件 @
b98b7440
...
@@ -203,7 +203,7 @@ def resize_short(im, size):
...
@@ -203,7 +203,7 @@ def resize_short(im, size):
h_new
=
size
*
h
//
w
h_new
=
size
*
h
//
w
else
:
else
:
w_new
=
size
*
w
//
h
w_new
=
size
*
w
//
h
im
=
cv2
.
resize
(
im
,
(
h_new
,
w
_new
),
interpolation
=
cv2
.
INTER_CUBIC
)
im
=
cv2
.
resize
(
im
,
(
w_new
,
h
_new
),
interpolation
=
cv2
.
INTER_CUBIC
)
return
im
return
im
...
@@ -345,7 +345,6 @@ def simple_transform(im,
...
@@ -345,7 +345,6 @@ def simple_transform(im,
if
np
.
random
.
randint
(
2
)
==
0
:
if
np
.
random
.
randint
(
2
)
==
0
:
im
=
left_right_flip
(
im
,
is_color
)
im
=
left_right_flip
(
im
,
is_color
)
else
:
else
:
im
=
center_crop
(
im
,
crop_size
,
is_color
)
im
=
center_crop
(
im
,
crop_size
,
is_color
=
is_color
)
im
=
center_crop
(
im
,
crop_size
,
is_color
=
is_color
)
if
len
(
im
.
shape
)
==
3
:
if
len
(
im
.
shape
)
==
3
:
im
=
to_chw
(
im
)
im
=
to_chw
(
im
)
...
...
python/paddle/dataset/movielens.py
浏览文件 @
b98b7440
...
@@ -24,6 +24,7 @@ set and test set into paddle reader creators.
...
@@ -24,6 +24,7 @@ set and test set into paddle reader creators.
from
__future__
import
print_function
from
__future__
import
print_function
import
numpy
as
np
import
zipfile
import
zipfile
import
paddle.dataset.common
import
paddle.dataset.common
import
re
import
re
...
@@ -150,12 +151,12 @@ def __initialize_meta_info__():
...
@@ -150,12 +151,12 @@ def __initialize_meta_info__():
def
__reader__
(
rand_seed
=
0
,
test_ratio
=
0.1
,
is_test
=
False
):
def
__reader__
(
rand_seed
=
0
,
test_ratio
=
0.1
,
is_test
=
False
):
fn
=
__initialize_meta_info__
()
fn
=
__initialize_meta_info__
()
rand
=
random
.
Random
(
x
=
rand_seed
)
np
.
random
.
seed
(
rand_seed
)
with
zipfile
.
ZipFile
(
file
=
fn
)
as
package
:
with
zipfile
.
ZipFile
(
file
=
fn
)
as
package
:
with
package
.
open
(
'ml-1m/ratings.dat'
)
as
rating
:
with
package
.
open
(
'ml-1m/ratings.dat'
)
as
rating
:
for
line
in
rating
:
for
line
in
rating
:
line
=
cpt
.
to_text
(
line
,
encoding
=
'latin'
)
line
=
cpt
.
to_text
(
line
,
encoding
=
'latin'
)
if
(
rand
.
random
()
<
test_ratio
)
==
is_test
:
if
(
np
.
random
.
random
()
<
test_ratio
)
==
is_test
:
uid
,
mov_id
,
rating
,
_
=
line
.
strip
().
split
(
"::"
)
uid
,
mov_id
,
rating
,
_
=
line
.
strip
().
split
(
"::"
)
uid
=
int
(
uid
)
uid
=
int
(
uid
)
mov_id
=
int
(
mov_id
)
mov_id
=
int
(
mov_id
)
...
...
python/paddle/fluid/framework.py
浏览文件 @
b98b7440
...
@@ -95,6 +95,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
...
@@ -95,6 +95,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
return
core
.
VarDesc
.
VarType
.
INT16
return
core
.
VarDesc
.
VarType
.
INT16
elif
dtype
==
np
.
uint8
:
elif
dtype
==
np
.
uint8
:
return
core
.
VarDesc
.
VarType
.
UINT8
return
core
.
VarDesc
.
VarType
.
UINT8
elif
dtype
==
np
.
int8
:
return
core
.
VarDesc
.
VarType
.
INT8
else
:
else
:
raise
ValueError
(
"Not supported numpy dtype %s"
%
dtype
)
raise
ValueError
(
"Not supported numpy dtype %s"
%
dtype
)
...
...
python/paddle/fluid/layers/control_flow.py
浏览文件 @
b98b7440
...
@@ -189,7 +189,6 @@ def Print(input,
...
@@ -189,7 +189,6 @@ def Print(input,
message="The content of some_layer: ")
message="The content of some_layer: ")
'''
'''
helper
=
LayerHelper
(
'print'
,
**
locals
())
helper
=
LayerHelper
(
'print'
,
**
locals
())
out
=
helper
.
create_tmp_variable
(
dtype
=
helper
.
input_dtype
())
helper
.
append_op
(
helper
.
append_op
(
type
=
'print'
,
type
=
'print'
,
inputs
=
{
'In'
:
input
},
inputs
=
{
'In'
:
input
},
...
@@ -202,9 +201,7 @@ def Print(input,
...
@@ -202,9 +201,7 @@ def Print(input,
'print_tensor_shape'
:
print_tensor_shape
,
'print_tensor_shape'
:
print_tensor_shape
,
'print_tensor_lod'
:
print_tensor_lod
,
'print_tensor_lod'
:
print_tensor_lod
,
'print_phase'
:
print_phase
.
upper
()
'print_phase'
:
print_phase
.
upper
()
},
})
outputs
=
{
'Out'
:
out
})
return
out
class
BlockGuard
(
object
):
class
BlockGuard
(
object
):
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
b98b7440
...
@@ -39,6 +39,7 @@ __all__ = [
...
@@ -39,6 +39,7 @@ __all__ = [
'detection_map'
,
'detection_map'
,
'rpn_target_assign'
,
'rpn_target_assign'
,
'anchor_generator'
,
'anchor_generator'
,
'generate_proposals'
,
]
]
__auto__
=
[
__auto__
=
[
...
@@ -1253,3 +1254,73 @@ def anchor_generator(input,
...
@@ -1253,3 +1254,73 @@ def anchor_generator(input,
anchor
.
stop_gradient
=
True
anchor
.
stop_gradient
=
True
var
.
stop_gradient
=
True
var
.
stop_gradient
=
True
return
anchor
,
var
return
anchor
,
var
def
generate_proposals
(
scores
,
bbox_deltas
,
im_info
,
anchors
,
variances
,
pre_nms_top_n
=
6000
,
post_nms_top_n
=
1000
,
nms_thresh
=
0.5
,
min_size
=
0.1
,
eta
=
1.0
,
name
=
None
):
"""
** Generate proposal labels Faster-RCNN **
This operation proposes RoIs according to each box with their probability to be a foreground object and
the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
could be used to train detection net.
For generating proposals, this operation performs following steps:
1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
2. Calculate box locations as proposals candidates.
3. Clip boxes to image
4. Remove predicted boxes with small area.
5. Apply NMS to get final proposals as output.
Args:
scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
N is batch size, A is number of anchors, H and W are height and width of the feature map.
bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location.
im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
between origin image size and the size of feature map.
anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
nms_thresh(float): Threshold in NMS, 0.5 by default.
min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
"""
helper
=
LayerHelper
(
'generate_proposals'
,
**
locals
())
rpn_rois
=
helper
.
create_tmp_variable
(
dtype
=
bbox_deltas
.
dtype
)
rpn_roi_probs
=
helper
.
create_tmp_variable
(
dtype
=
scores
.
dtype
)
helper
.
append_op
(
type
=
"generate_proposals"
,
inputs
=
{
'Scores'
:
scores
,
'BboxDeltas'
:
bbox_deltas
,
'ImInfo'
:
im_info
,
'Anchors'
:
anchors
,
'Variances'
:
variances
},
attrs
=
{
'pre_nms_topN'
:
pre_nms_top_n
,
'post_nms_topN'
:
post_nms_top_n
,
'nms_thresh'
:
nms_thresh
,
'min_size'
:
min_size
,
'eta'
:
eta
},
outputs
=
{
'RpnRois'
:
rpn_rois
,
'RpnRoiProbs'
:
rpn_roi_probs
})
rpn_rois
.
stop_gradient
=
True
rpn_roi_probs
.
stop_gradient
=
True
return
rpn_rois
,
rpn_roi_probs
python/paddle/fluid/layers/io.py
浏览文件 @
b98b7440
...
@@ -246,7 +246,11 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True):
...
@@ -246,7 +246,11 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True):
rpc_op_role_name
:
core
.
op_proto_and_checker_maker
.
OpRole
.
RPC
rpc_op_role_name
:
core
.
op_proto_and_checker_maker
.
OpRole
.
RPC
})
})
if
sync
:
if
sync
:
helper
.
append_op
(
type
=
"send_barrier"
,
attrs
=
{
"endpoints"
:
endpoints
})
helper
.
append_op
(
type
=
"send_barrier"
,
inputs
=
{
"X"
:
dummy_output
},
outputs
=
{
"Out"
:
[]},
attrs
=
{
"endpoints"
:
endpoints
})
def
Recv
(
endpoints
,
get_vars
,
dummy_input
=
None
,
sync
=
True
):
def
Recv
(
endpoints
,
get_vars
,
dummy_input
=
None
,
sync
=
True
):
...
@@ -282,7 +286,10 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True):
...
@@ -282,7 +286,10 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True):
attrs
=
{
"endpoints"
:
endpoints
,
attrs
=
{
"endpoints"
:
endpoints
,
"epmap"
:
epmap
})
"epmap"
:
epmap
})
if
sync
:
if
sync
:
helper
.
append_op
(
type
=
"fetch_barrier"
,
attrs
=
{
"endpoints"
:
endpoints
})
helper
.
append_op
(
type
=
"fetch_barrier"
,
outputs
=
{
"Out"
:
get_vars
},
attrs
=
{
"endpoints"
:
endpoints
})
return
get_vars
return
get_vars
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
b98b7440
...
@@ -17,6 +17,7 @@ All layers just related to the neural network.
...
@@ -17,6 +17,7 @@ All layers just related to the neural network.
from
__future__
import
print_function
from
__future__
import
print_function
import
numpy
as
np
from
..layer_helper
import
LayerHelper
from
..layer_helper
import
LayerHelper
from
..initializer
import
Normal
,
Constant
from
..initializer
import
Normal
,
Constant
from
..framework
import
Variable
from
..framework
import
Variable
...
@@ -24,7 +25,6 @@ from ..param_attr import ParamAttr
...
@@ -24,7 +25,6 @@ from ..param_attr import ParamAttr
from
.layer_function_generator
import
autodoc
,
templatedoc
from
.layer_function_generator
import
autodoc
,
templatedoc
from
.tensor
import
concat
from
.tensor
import
concat
from
.
import
utils
from
.
import
utils
import
random
from
..
import
unique_name
from
..
import
unique_name
from
functools
import
reduce
from
functools
import
reduce
...
@@ -54,6 +54,7 @@ __all__ = [
...
@@ -54,6 +54,7 @@ __all__ = [
'conv2d_transpose'
,
'conv2d_transpose'
,
'conv3d_transpose'
,
'conv3d_transpose'
,
'sequence_expand'
,
'sequence_expand'
,
'sequence_pad'
,
'lstm_unit'
,
'lstm_unit'
,
'reduce_sum'
,
'reduce_sum'
,
'reduce_mean'
,
'reduce_mean'
,
...
@@ -87,6 +88,7 @@ __all__ = [
...
@@ -87,6 +88,7 @@ __all__ = [
'lod_reset'
,
'lod_reset'
,
'lrn'
,
'lrn'
,
'pad'
,
'pad'
,
'pad_constant_like'
,
'label_smooth'
,
'label_smooth'
,
'roi_pool'
,
'roi_pool'
,
'dice_loss'
,
'dice_loss'
,
...
@@ -105,6 +107,7 @@ __all__ = [
...
@@ -105,6 +107,7 @@ __all__ = [
'flatten'
,
'flatten'
,
'sequence_mask'
,
'sequence_mask'
,
'stack'
,
'stack'
,
'unstack'
,
'sequence_enumerate'
,
'sequence_enumerate'
,
]
]
...
@@ -2656,6 +2659,51 @@ def sequence_expand(x, y, ref_level=-1, name=None):
...
@@ -2656,6 +2659,51 @@ def sequence_expand(x, y, ref_level=-1, name=None):
return
tmp
return
tmp
@templatedoc()
def sequence_pad(x, pad_value, maxlen=None):
    """
    ${comment}

    Args:
        x(Variable): Input variable which should contain lod information.
        pad_value(Variable): The Variable that holds values that will be
            filled into padded steps. It can be a scalar or a tensor whose
            shape equals to time steps in sequences. If it's a scalar, it
            will be automatically broadcasted to the shape of time step.
        maxlen(int, default None): The length of padded sequences. It can be
            None or any positive int. When it is None, all sequences will be
            padded up to the length of the longest one among them; when it is
            a certain positive value, it must be greater than the length of
            the longest original sequence.

    Returns:
        Variable: The padded sequence batch. All sequences have the same
        length.

    Examples:
        .. code-block:: python

            import numpy

            x = fluid.layers.data(name='y', shape=[10, 5],
                             dtype='float32', lod_level=1)
            pad_value = fluid.layers.assign(input=numpy.array([0]))
            out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
    """
    # Build the helper before creating any other local: **locals() must
    # forward exactly the call arguments (x, pad_value, maxlen).
    helper = LayerHelper('sequence_pad', input=x, **locals())
    padded = helper.create_tmp_variable(helper.input_dtype())

    # The op attribute is always an int; None is passed down as -1.
    padded_length = -1 if maxlen is None else maxlen

    op_inputs = {'X': x, 'PadValue': pad_value}
    op_outputs = {'Out': padded}
    helper.append_op(
        type='sequence_pad',
        inputs=op_inputs,
        outputs=op_outputs,
        attrs={'padded_length': padded_length})
    return padded
def
beam_search
(
pre_ids
,
def
beam_search
(
pre_ids
,
pre_scores
,
pre_scores
,
ids
,
ids
,
...
@@ -4709,6 +4757,86 @@ def pad(x, paddings, pad_value=0., name=None):
...
@@ -4709,6 +4757,86 @@ def pad(x, paddings, pad_value=0., name=None):
return
out
return
out
def pad_constant_like(x, y, pad_value=0., name=None):
    """
    Pad input(Y) with :attr:`pad_value`. The number of values padded to the
    edges of each axis is given by the difference between the shapes of X
    and Y: ((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) are
    the pad widths for each axis. The input should be a k-D tensor
    (k > 0 and k < 7).

    See below for an example.

    .. code-block:: text

        Given:
            X = [[[[ 0,  1,  2],
                   [ 3,  4,  5]],
                  [[ 6,  7,  8],
                   [ 9, 10, 11]],
                  [[12, 13, 14],
                   [15, 16, 17]]],
                 [[[18, 19, 20],
                   [21, 22, 23]],
                  [[24, 25, 26],
                   [27, 28, 29]],
                  [[30, 31, 32],
                   [33, 34, 35]]]]
            X.shape = (2, 3, 2, 3)

            Y = [[[[35, 36, 37]],
                  [[38, 39, 40]],
                  [[41, 42, 43]]]]
            Y.shape = (1, 3, 1, 3)

        And
            pad_value = -1,

        Return:
            Out = [[[[35, 36, 37],
                     [-1, -1, -1]],
                    [[38, 39, 40],
                     [-1, -1, -1]],
                    [[41, 42, 43],
                     [-1, -1, -1]]],
                   [[[-1, -1, -1],
                     [-1, -1, -1]],
                    [[-1, -1, -1],
                     [-1, -1, -1]],
                    [[-1, -1, -1],
                     [-1, -1, -1]]]]
            Out.shape = (2, 3, 2, 3)

    Args:
        x (Variable): The input tensor variable.
        y (Variable): The input tensor variable.
        pad_value (float): The constant value used to pad.
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.

    Returns:
        Variable: The padded tensor variable.

    Examples:
        .. code-block:: python

            # x is a rank 4 tensor variable, x.shape = (2, 3, 2, 3)
            # y is a rank 4 tensor variable, y.shape = (1, 3, 1, 3)
            out = fluid.layers.pad_constant_like(x=x, y=y, pad_value=0.)
            # out is a rank 4 tensor variable, and out.shape = [2, 3 ,2 , 3]
    """
    # Build the helper before creating any other local: **locals() must
    # forward exactly the call arguments (x, y, pad_value, name).
    helper = LayerHelper('pad_constant_like', input=x, **locals())
    padded = helper.create_tmp_variable(helper.input_dtype())

    # The attribute is coerced with float(), so ints are accepted as well.
    op_attrs = {'pad_value': float(pad_value)}
    helper.append_op(
        type='pad_constant_like',
        inputs={'X': x,
                'Y': y},
        outputs={'Out': padded},
        attrs=op_attrs)
    return padded
def
label_smooth
(
label
,
def
label_smooth
(
label
,
prior_dist
=
None
,
prior_dist
=
None
,
epsilon
=
0.1
,
epsilon
=
0.1
,
...
@@ -5103,7 +5231,7 @@ def random_crop(x, shape, seed=None):
...
@@ -5103,7 +5231,7 @@ def random_crop(x, shape, seed=None):
dtype
=
x
.
dtype
dtype
=
x
.
dtype
out
=
helper
.
create_tmp_variable
(
dtype
)
out
=
helper
.
create_tmp_variable
(
dtype
)
if
seed
is
None
:
if
seed
is
None
:
seed
=
random
.
randint
(
-
65536
,
65535
)
seed
=
np
.
random
.
randint
(
-
65536
,
65536
)
op_attrs
=
{
"shape"
:
shape
}
op_attrs
=
{
"shape"
:
shape
}
if
isinstance
(
seed
,
int
):
if
isinstance
(
seed
,
int
):
op_attrs
[
"startup_seed"
]
=
seed
op_attrs
[
"startup_seed"
]
=
seed
...
@@ -5649,3 +5777,44 @@ def stack(x, axis=0):
...
@@ -5649,3 +5777,44 @@ def stack(x, axis=0):
attrs
=
{
'axis'
:
axis
})
attrs
=
{
'axis'
:
axis
})
return
out
return
out
def unstack(x, axis=0, num=None):
    """
    **UnStack Layer**

    This layer unstacks input :code:`x` into several tensors along axis.

    If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`.
    If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`,
    and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is
    raised.

    Args:
        x (Variable): Input variable.
        axis (int): The axis along which the input is unstacked.
        num (int|None): The number of output variables.

    Returns:
        list(Variable): The unstacked variables.

    Raises:
        ValueError: If :code:`num` is None and it cannot be inferred because
            :code:`axis` is None or :code:`x.shape[axis]` <= 0.
    """
    # Build the helper before creating any other local: **locals() must
    # forward exactly the call arguments (x, axis, num).
    helper = LayerHelper('unstack', **locals())

    if num is None:
        if axis is None or x.shape[axis] <= 0:
            raise ValueError('unknown unstack number')
        num = x.shape[axis]

    # One output variable per slice. `num` is an int, so it has to be wrapped
    # in range(); iterating over the int itself raises TypeError.
    outs = [helper.create_tmp_variable(x.dtype) for _ in range(num)]

    helper.append_op(
        type='unstack',
        inputs={'X': [x]},
        outputs={'Y': outs},
        attrs={'axis': axis,
               'num': num})
    return outs
python/paddle/fluid/optimizer.py
浏览文件 @
b98b7440
...
@@ -46,10 +46,12 @@ class Optimizer(object):
...
@@ -46,10 +46,12 @@ class Optimizer(object):
def
__init__
(
self
,
def
__init__
(
self
,
learning_rate
,
learning_rate
,
regularization
=
None
,
regularization
=
None
,
LARS_weight_decay
=
0.0
):
LARS_weight_decay
=
0.0
,
name
=
None
):
if
not
isinstance
(
learning_rate
,
float
)
and
\
if
not
isinstance
(
learning_rate
,
float
)
and
\
not
isinstance
(
learning_rate
,
framework
.
Variable
):
not
isinstance
(
learning_rate
,
framework
.
Variable
):
raise
TypeError
(
"learning rate should be float or Variable"
)
raise
TypeError
(
"learning rate should be float or Variable"
)
self
.
_name
=
name
self
.
regularization
=
regularization
self
.
regularization
=
regularization
self
.
_learning_rate
=
learning_rate
self
.
_learning_rate
=
learning_rate
# the learning rate type should be inferenced from loss
# the learning rate type should be inferenced from loss
...
@@ -153,6 +155,8 @@ class Optimizer(object):
...
@@ -153,6 +155,8 @@ class Optimizer(object):
dtype: data type of the accumulator variable
dtype: data type of the accumulator variable
fill_value: value to initialize the accumulator variable
fill_value: value to initialize the accumulator variable
"""
"""
if
self
.
_name
is
not
None
:
name
=
self
.
_name
+
"_"
+
name
if
(
name
in
self
.
_accumulators
and
if
(
name
in
self
.
_accumulators
and
param
.
name
in
self
.
_accumulators
[
name
]):
param
.
name
in
self
.
_accumulators
[
name
]):
raise
Exception
(
"Accumulator {} already exists for parameter {}"
.
raise
Exception
(
"Accumulator {} already exists for parameter {}"
.
...
@@ -181,6 +185,8 @@ class Optimizer(object):
...
@@ -181,6 +185,8 @@ class Optimizer(object):
Returns:
Returns:
accumulator variable for the parameter
accumulator variable for the parameter
"""
"""
if
self
.
_name
is
not
None
:
name
=
self
.
_name
+
"_"
+
name
if
(
name
not
in
self
.
_accumulators
or
if
(
name
not
in
self
.
_accumulators
or
param
.
name
not
in
self
.
_accumulators
[
name
]):
param
.
name
not
in
self
.
_accumulators
[
name
]):
raise
Exception
(
"Accumulator {} does not exist for parameter {}"
.
raise
Exception
(
"Accumulator {} does not exist for parameter {}"
.
...
...
python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
浏览文件 @
b98b7440
...
@@ -125,8 +125,8 @@ opts = optimizer.minimize(avg_cost)
...
@@ -125,8 +125,8 @@ opts = optimizer.minimize(avg_cost)
batch_size
=
fluid
.
layers
.
create_tensor
(
dtype
=
'int64'
)
batch_size
=
fluid
.
layers
.
create_tensor
(
dtype
=
'int64'
)
batch_acc
=
fluid
.
layers
.
accuracy
(
input
=
predict
,
label
=
label
,
total
=
batch_size
)
batch_acc
=
fluid
.
layers
.
accuracy
(
input
=
predict
,
label
=
label
,
total
=
batch_size
)
#
fluid.memory_optimize(fluid.default_main_program(), level=0)
fluid
.
memory_optimize
(
fluid
.
default_main_program
(),
level
=
0
)
fluid
.
release_memory
(
fluid
.
default_main_program
())
#
fluid.release_memory(fluid.default_main_program())
BATCH_SIZE
=
16
BATCH_SIZE
=
16
PASS_NUM
=
1
PASS_NUM
=
1
...
...
python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
浏览文件 @
b98b7440
...
@@ -92,8 +92,8 @@ def main():
...
@@ -92,8 +92,8 @@ def main():
optimizer
=
fluid
.
optimizer
.
Adagrad
(
learning_rate
=
1e-4
)
optimizer
=
fluid
.
optimizer
.
Adagrad
(
learning_rate
=
1e-4
)
optimizer
.
minimize
(
avg_cost
)
optimizer
.
minimize
(
avg_cost
)
#
fluid.memory_optimize(fluid.default_main_program())
fluid
.
memory_optimize
(
fluid
.
default_main_program
())
fluid
.
release_memory
(
fluid
.
default_main_program
())
#
fluid.release_memory(fluid.default_main_program())
# fix the order of training data
# fix the order of training data
train_data
=
paddle
.
batch
(
train_data
=
paddle
.
batch
(
...
...
python/paddle/fluid/tests/test_detection.py
浏览文件 @
b98b7440
...
@@ -201,5 +201,44 @@ class TestDetectionMAP(unittest.TestCase):
...
@@ -201,5 +201,44 @@ class TestDetectionMAP(unittest.TestCase):
print
(
str
(
program
))
print
(
str
(
program
))
class TestGenerateProposals(unittest.TestCase):
    """Checks that the generate_proposals layer can be built and returns outputs."""

    def test_generate_proposals(self):
        layers = fluid.layers

        # Input placeholders for the image batch and the per-image info.
        image_var = layers.data(
            name='images', shape=[20, 64, 64], dtype='float32')
        info_var = layers.data(name='im_info', shape=[1, 3], dtype='float32')

        anchor_var, variance_var = layers.anchor_generator(
            name='anchor_generator',
            input=image_var,
            anchor_sizes=[32, 64],
            aspect_ratios=[1.0],
            variance=[0.1, 0.1, 0.2, 0.2],
            stride=[16.0, 16.0],
            offset=0.5)

        # The score / delta placeholders are sized from the anchor tensor.
        anchor_count = anchor_var.shape[2]
        score_var = layers.data(
            name='scores', shape=[1, anchor_count, 8, 8], dtype='float32')
        delta_var = layers.data(
            name='bbox_deltas',
            shape=[1, anchor_count * 4, 8, 8],
            dtype='float32')

        rois, roi_probs = layers.generate_proposals(
            name='generate_proposals',
            scores=score_var,
            bbox_deltas=delta_var,
            im_info=info_var,
            anchors=anchor_var,
            variances=variance_var,
            pre_nms_top_n=6000,
            post_nms_top_n=1000,
            nms_thresh=0.5,
            min_size=0.1,
            eta=1.0)

        self.assertIsNotNone(rois)
        self.assertIsNotNone(roi_probs)
        print(rois.shape)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/dist_se_resnext.py
浏览文件 @
b98b7440
...
@@ -134,7 +134,7 @@ class SE_ResNeXt():
...
@@ -134,7 +134,7 @@ class SE_ResNeXt():
size
=
class_dim
,
size
=
class_dim
,
act
=
'softmax'
,
act
=
'softmax'
,
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.
2
)))
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.
05
)))
return
out
return
out
def
shortcut
(
self
,
input
,
ch_out
,
stride
):
def
shortcut
(
self
,
input
,
ch_out
,
stride
):
...
@@ -184,7 +184,7 @@ class SE_ResNeXt():
...
@@ -184,7 +184,7 @@ class SE_ResNeXt():
act
=
None
,
act
=
None
,
# avoid pserver CPU init differs from GPU
# avoid pserver CPU init differs from GPU
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.
2
)),
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.
05
)),
bias_attr
=
False
)
bias_attr
=
False
)
return
fluid
.
layers
.
batch_norm
(
input
=
conv
,
act
=
act
)
return
fluid
.
layers
.
batch_norm
(
input
=
conv
,
act
=
act
)
...
@@ -192,12 +192,18 @@ class SE_ResNeXt():
...
@@ -192,12 +192,18 @@ class SE_ResNeXt():
pool
=
fluid
.
layers
.
pool2d
(
pool
=
fluid
.
layers
.
pool2d
(
input
=
input
,
pool_size
=
0
,
pool_type
=
'avg'
,
global_pooling
=
True
)
input
=
input
,
pool_size
=
0
,
pool_type
=
'avg'
,
global_pooling
=
True
)
stdv
=
1.0
/
math
.
sqrt
(
pool
.
shape
[
1
]
*
1.0
)
stdv
=
1.0
/
math
.
sqrt
(
pool
.
shape
[
1
]
*
1.0
)
squeeze
=
fluid
.
layers
.
fc
(
input
=
pool
,
squeeze
=
fluid
.
layers
.
fc
(
input
=
pool
,
size
=
num_channels
//
reduction_ratio
,
size
=
num_channels
//
reduction_ratio
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.05
)),
act
=
'relu'
)
act
=
'relu'
)
stdv
=
1.0
/
math
.
sqrt
(
squeeze
.
shape
[
1
]
*
1.0
)
stdv
=
1.0
/
math
.
sqrt
(
squeeze
.
shape
[
1
]
*
1.0
)
excitation
=
fluid
.
layers
.
fc
(
input
=
squeeze
,
excitation
=
fluid
.
layers
.
fc
(
input
=
squeeze
,
size
=
num_channels
,
size
=
num_channels
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.05
)),
act
=
'sigmoid'
)
act
=
'sigmoid'
)
scale
=
fluid
.
layers
.
elementwise_mul
(
x
=
input
,
y
=
excitation
,
axis
=
0
)
scale
=
fluid
.
layers
.
elementwise_mul
(
x
=
input
,
y
=
excitation
,
axis
=
0
)
return
scale
return
scale
...
...
python/paddle/fluid/tests/unittests/dist_transformer.py
浏览文件 @
b98b7440
...
@@ -18,54 +18,129 @@ import numpy as np
...
@@ -18,54 +18,129 @@ import numpy as np
import
argparse
import
argparse
import
time
import
time
import
math
import
math
import
os
import
sys
import
six
import
argparse
import
ast
import
multiprocessing
import
time
from
functools
import
partial
from
os.path
import
expanduser
import
glob
import
random
import
tarfile
import
paddle
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.layers
as
layers
from
paddle.fluid
import
core
from
paddle.fluid
import
core
import
os
from
test_dist_base
import
TestDistRunnerBase
,
runtime_main
import
sys
from
paddle.compat
import
long_type
import
six
import
transformer_model
import
hashlib
import
paddle.dataset.wmt16
as
wmt16
from
paddle.fluid.transpiler.details
import
program_to_code
const_para_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
0.001
))
const_bias_attr
=
const_para_attr
# Fix seed for test
# Fix seed for test
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
WMT16_RECORDIO_FILE
=
"/tmp/wmt16.recordio"
#from transformer_config import ModelHyperParams, TrainTaskConfig, merge_cfg_from_list
class
TrainTaskConfig
(
object
):
# only support GPU currently
use_gpu
=
True
# the epoch number to train.
pass_num
=
1
# the number of sequences contained in a mini-batch.
# deprecated, set batch_size in args.
batch_size
=
20
# the hyper parameters for Adam optimizer.
# This static learning_rate will be multiplied by the LearningRateScheduler
# derived learning rate to get the final learning rate.
learning_rate
=
1
beta1
=
0.9
beta2
=
0.98
eps
=
1e-9
# the parameters for learning rate scheduling.
warmup_steps
=
4000
# the weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
label_smooth_eps
=
0.1
# the directory for saving trained models.
model_dir
=
"trained_models"
# the directory for saving checkpoints.
ckpt_dir
=
"trained_ckpts"
# the directory for loading checkpoint.
# If provided, continue training from the checkpoint.
ckpt_path
=
None
# the parameter to initialize the learning rate scheduler.
# It should be provided if use checkpoints, since the checkpoint doesn't
# include the training step counter currently.
start_step
=
0
class
ModelHyperParams
(
object
):
check_acc
=
True
# Dictionary size for source and target language. This model directly uses
# paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
# alreay been added, but the <pad> token is not added. Transformer requires
# sequences in a mini-batch are padded to have the same length. A <pad> token is
# added into the original dictionary in paddle.dateset.wmt16.
# size of source word dictionary.
data_path
=
expanduser
(
"~"
)
+
(
src_vocab_size
=
10000
"/.cache/paddle/dataset/test_dist_transformer/"
)
# index for <pad> token in source language.
src_vocab_fpath
=
data_path
+
"vocab.bpe.32000"
src_pad_idx
=
src_vocab_size
trg_vocab_fpath
=
data_path
+
"vocab.bpe.32000"
train_file_pattern
=
data_path
+
"train.tok.clean.bpe.32000.en-de"
val_file_pattern
=
data_path
+
"newstest2013.tok.bpe.32000.en-de"
pool_size
=
2000
sort_type
=
None
local
=
True
shuffle
=
False
shuffle_batch
=
False
special_token
=
[
'<s>'
,
'<e>'
,
'<unk>'
]
token_delimiter
=
' '
use_token_batch
=
False
# size of target word dictionay
trg_vocab_size
=
10000
# index for <pad> token in target language.
trg_pad_idx
=
trg_vocab_size
# position value corresponding to the <pad> token.
class
InferTaskConfig
(
object
):
pos_pad_idx
=
0
use_gpu
=
True
# the number of examples in one run for sequence generation.
batch_size
=
10
# the parameters for beam search.
beam_size
=
5
max_out_len
=
256
# the number of decoded sentences to output.
n_best
=
1
# the flags indicating whether to output the special tokens.
output_bos
=
False
output_eos
=
False
output_unk
=
True
# the directory for loading the trained model.
model_path
=
"trained_models/pass_1.infer.model"
# max length of sequences. It should plus 1 to include position
# padding token for position encoding.
max_length
=
50
class
ModelHyperParams
(
object
):
# These following five vocabularies related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size
=
10000
# size of target word dictionary
trg_vocab_size
=
10000
# index for <bos> token
bos_idx
=
0
# index for <eos> token
eos_idx
=
1
# index for <unk> token
unk_idx
=
2
# max length of sequences deciding the size of position encoding table.
# Start from 1 and count start and end tokens in.
max_length
=
256
# the dimension for word embeddings, which is also the last dimension of
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
# networks, encoder and decoder.
d_model
=
512
d_model
=
512
# size of the hidden layer in position-wise feed-forward networks.
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid
=
1024
d_inner_hid
=
2048
# the dimension that keys are projected to for dot-product attention.
# the dimension that keys are projected to for dot-product attention.
d_key
=
64
d_key
=
64
# the dimension that values are projected to for dot-product attention.
# the dimension that values are projected to for dot-product attention.
...
@@ -75,46 +150,195 @@ class ModelHyperParams(object):
...
@@ -75,46 +150,195 @@ class ModelHyperParams(object):
# number of sub-layers to be stacked in the encoder and decoder.
# number of sub-layers to be stacked in the encoder and decoder.
n_layer
=
6
n_layer
=
6
# dropout rate used by all dropout layers.
# dropout rate used by all dropout layers.
dropout
=
0.1
dropout
=
0.0
# no random
# random seed used in dropout for CE.
dropout_seed
=
None
# the flag indicating whether to share embedding and softmax weights.
# vocabularies in source and target should be same for weight sharing.
weight_sharing
=
True
def
prepare_batch_input
(
insts
,
src_pad_idx
,
trg_pad_idx
,
n_head
):
def merge_cfg_from_list(cfg_list, g_cfgs):
    """
    Set the above global configurations using the cfg_list.

    Args:
        cfg_list (list): A flat list of alternating keys and values,
            ``[key0, value0, key1, value1, ...]``. Its length must be even.
        g_cfgs (iterable): Configuration objects to update. For each key,
            only the first object that already has an attribute of that name
            is modified; keys that match no object are ignored.

    Each value is passed through ``eval`` before being stored. If ``eval``
    raises, the value is stored unchanged (this is how plain strings such as
    file paths are kept as-is).

    NOTE(review): ``eval`` executes arbitrary expressions and runs in this
    function's scope, so ``cfg_list`` must come from a trusted source.
    """
    # Keys and values must come in pairs.
    assert len(cfg_list) % 2 == 0
    for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
        for g_cfg in g_cfgs:
            if hasattr(g_cfg, key):
                try:
                    value = eval(value)
                except Exception:  # for file path
                    pass
                setattr(g_cfg, key, value)
                # Only the first matching config object receives the value.
                break
# The placeholder for batch_size in compile time. Must be -1 currently to be
# consistent with some ops' infer-shape output in compile time, such as the
# sequence_expand op used in beamsearch decoder.
batch_size
=
-
1
# The placeholder for sequence length in compile time.
seq_len
=
ModelHyperParams
.
max_length
# Here list the data shapes and data types of all inputs.
# The shapes here act as placeholder and are set to pass the infer-shape in
# compile time.
input_descs
=
{
# The actual data shape of src_word is:
# [batch_size * max_src_len_in_batch, 1]
"src_word"
:
[(
batch_size
,
seq_len
,
long_type
(
1
)),
"int64"
,
2
],
# The actual data shape of src_pos is:
# [batch_size * max_src_len_in_batch, 1]
"src_pos"
:
[(
batch_size
,
seq_len
,
long_type
(
1
)),
"int64"
],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
# [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
"src_slf_attn_bias"
:
[(
batch_size
,
ModelHyperParams
.
n_head
,
seq_len
,
seq_len
),
"float32"
],
# The actual data shape of trg_word is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_word"
:
[(
batch_size
,
seq_len
,
long_type
(
1
)),
"int64"
,
2
],
# lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_pos"
:
[(
batch_size
,
seq_len
,
long_type
(
1
)),
"int64"
],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
"trg_slf_attn_bias"
:
[(
batch_size
,
ModelHyperParams
.
n_head
,
seq_len
,
seq_len
),
"float32"
],
# This input is used to remove attention weights on paddings of the source
# input in the encoder-decoder attention.
# The actual data shape of trg_src_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
"trg_src_attn_bias"
:
[(
batch_size
,
ModelHyperParams
.
n_head
,
seq_len
,
seq_len
),
"float32"
],
# This input is used in independent decoder program for inference.
# The actual data shape of enc_output is:
# [batch_size, max_src_len_in_batch, d_model]
"enc_output"
:
[(
batch_size
,
seq_len
,
ModelHyperParams
.
d_model
),
"float32"
],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word"
:
[(
batch_size
*
seq_len
,
long_type
(
1
)),
"int64"
],
# This input is used to mask out the loss of padding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight"
:
[(
batch_size
*
seq_len
,
long_type
(
1
)),
"float32"
],
# These inputs are used to change the shape tensor in beam-search decoder.
"trg_slf_attn_pre_softmax_shape_delta"
:
[(
long_type
(
2
),
),
"int32"
],
"trg_slf_attn_post_softmax_shape_delta"
:
[(
long_type
(
4
),
),
"int32"
],
"init_score"
:
[(
batch_size
,
long_type
(
1
)),
"float32"
],
}
# Names of word embedding table which might be reused for weight sharing.
word_emb_param_names
=
(
"src_word_emb_table"
,
"trg_word_emb_table"
,
)
# Names of position encoding table which will be initialized externally.
pos_enc_param_names
=
(
"src_pos_enc_table"
,
"trg_pos_enc_table"
,
)
# separated inputs for different usages.
encoder_data_input_fields
=
(
"src_word"
,
"src_pos"
,
"src_slf_attn_bias"
,
)
decoder_data_input_fields
=
(
"trg_word"
,
"trg_pos"
,
"trg_slf_attn_bias"
,
"trg_src_attn_bias"
,
"enc_output"
,
)
label_data_input_fields
=
(
"lbl_word"
,
"lbl_weight"
,
)
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields
=
(
"trg_word"
,
"init_score"
,
"trg_src_attn_bias"
,
)
# fast_decoder_util_input_fields = (
# "trg_slf_attn_pre_softmax_shape_delta",
# "trg_slf_attn_post_softmax_shape_delta", )
#from optim import LearningRateScheduler
class
LearningRateScheduler
(
object
):
"""
Wrapper for learning rate scheduling as described in the Transformer paper.
LearningRateScheduler adapts the learning rate externally and the adapted
learning rate will be fed into the main_program as input data.
"""
"""
def
__pad_batch_data
(
insts
,
def __init__(self,
             d_model,
             warmup_steps,
             learning_rate=0.001,
             current_steps=0,
             name="learning_rate"):
    """
    Store the schedule parameters and create the learning-rate variable.

    Args:
        d_model: Stored as ``self.d_model``.
        warmup_steps: Stored as ``self.warmup_steps``.
        learning_rate: Stored as ``self.static_lr``; its float value is also
            the initial value of the created global variable.
        current_steps: Initial value of the step counter.
        name: Name given to the created global variable.
    """
    self.d_model = d_model
    self.warmup_steps = warmup_steps
    self.current_steps = current_steps
    self.static_lr = learning_rate

    # Persistable global variable of shape [1] holding the learning rate.
    initial_value = float(learning_rate)
    self.learning_rate = layers.create_global_var(
        name=name,
        shape=[1],
        value=initial_value,
        dtype="float32",
        persistable=True)
def update_learning_rate(self):
    """
    Advance the step counter by one and compute the learning rate for it.

    The value is ``d_model ** -0.5 * min(step ** -0.5,
    warmup_steps ** -1.5 * step) * static_lr``.

    Returns:
        numpy.ndarray: A float32 array of shape (1,) holding the new rate.
    """
    self.current_steps += 1
    step = self.current_steps

    decay_term = np.power(step, -0.5)
    warmup_term = np.power(self.warmup_steps, -1.5) * step
    model_scale = np.power(self.d_model, -0.5)

    lr_value = model_scale * np.min([decay_term, warmup_term]) * self.static_lr
    return np.array([lr_value], dtype="float32")
#from transformer_train import train_loop
def
pad_batch_data
(
insts
,
pad_idx
,
pad_idx
,
n_head
,
is_target
=
False
,
is_target
=
False
,
return_pos
=
Tru
e
,
is_label
=
Fals
e
,
return_attn_bias
=
True
,
return_attn_bias
=
True
,
return_max_len
=
True
):
return_max_len
=
True
,
return_num_token
=
False
):
"""
"""
Pad the instances to the max sequence length in batch, and generate the
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias.
corresponding position data and attention bias.
"""
"""
return_list
=
[]
return_list
=
[]
max_len
=
max
(
len
(
inst
)
for
inst
in
insts
)
max_len
=
max
(
len
(
inst
)
for
inst
in
insts
)
num_token
=
reduce
(
lambda
x
,
y
:
x
+
y
,
[
len
(
inst
)
for
inst
in
insts
])
if
return_num_token
else
0
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients.
inst_data
=
np
.
array
(
inst_data
=
np
.
array
(
[
inst
+
[
pad_idx
]
*
(
max_len
-
len
(
inst
))
for
inst
in
insts
])
[
inst
+
[
pad_idx
]
*
(
max_len
-
len
(
inst
))
for
inst
in
insts
])
return_list
+=
[
inst_data
.
astype
(
"int64"
).
reshape
([
-
1
,
1
])]
return_list
+=
[
inst_data
.
astype
(
"int64"
).
reshape
([
-
1
,
1
])]
if
return_pos
:
if
is_label
:
# label weight
inst_pos
=
np
.
array
([[
inst_weight
=
np
.
array
(
pos_i
+
1
if
w_i
!=
pad_idx
else
0
[[
1.
]
*
len
(
inst
)
+
[
0.
]
*
(
max_len
-
len
(
inst
))
for
inst
in
insts
])
for
pos_i
,
w_i
in
enumerate
(
inst
)
return_list
+=
[
inst_weight
.
astype
(
"float32"
).
reshape
([
-
1
,
1
])]
]
for
inst
in
inst_data
])
else
:
# position data
inst_pos
=
np
.
array
([
range
(
1
,
len
(
inst
)
+
1
)
+
[
0
]
*
(
max_len
-
len
(
inst
))
for
inst
in
insts
])
return_list
+=
[
inst_pos
.
astype
(
"int64"
).
reshape
([
-
1
,
1
])]
return_list
+=
[
inst_pos
.
astype
(
"int64"
).
reshape
([
-
1
,
1
])]
if
return_attn_bias
:
if
return_attn_bias
:
if
is_target
:
if
is_target
:
# This is used to avoid attention on paddings and subsequent
# This is used to avoid attention on paddings and subsequent
# words.
# words.
slf_attn_bias_data
=
np
.
ones
((
inst_data
.
shape
[
0
],
max_len
,
slf_attn_bias_data
=
np
.
ones
((
inst_data
.
shape
[
0
],
max_len
,
max_len
))
max_len
))
slf_attn_bias_data
=
np
.
triu
(
slf_attn_bias_data
,
slf_attn_bias_data
=
np
.
triu
(
slf_attn_bias_data
,
1
).
reshape
(
1
).
reshape
([
-
1
,
1
,
max_len
,
max_len
])
[
-
1
,
1
,
max_len
,
max_len
])
slf_attn_bias_data
=
np
.
tile
(
slf_attn_bias_data
,
slf_attn_bias_data
=
np
.
tile
(
slf_attn_bias_data
,
[
1
,
n_head
,
1
,
1
])
*
[
-
1e9
]
[
1
,
n_head
,
1
,
1
])
*
[
-
1e9
]
else
:
else
:
...
@@ -128,42 +352,1319 @@ def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
...
@@ -128,42 +352,1319 @@ def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
return_list
+=
[
slf_attn_bias_data
.
astype
(
"float32"
)]
return_list
+=
[
slf_attn_bias_data
.
astype
(
"float32"
)]
if
return_max_len
:
if
return_max_len
:
return_list
+=
[
max_len
]
return_list
+=
[
max_len
]
if
return_num_token
:
return_list
+=
[
num_token
]
return
return_list
if
len
(
return_list
)
>
1
else
return_list
[
0
]
return
return_list
if
len
(
return_list
)
>
1
else
return_list
[
0
]
src_word
,
src_pos
,
src_slf_attn_bias
,
src_max_len
=
__pad_batch_data
(
[
inst
[
0
]
for
inst
in
insts
],
src_pad_idx
,
is_target
=
False
)
def
prepare_batch_input
(
insts
,
data_input_names
,
src_pad_idx
,
trg_pad_idx
,
trg_word
,
trg_pos
,
trg_slf_attn_bias
,
trg_max_len
=
__pad_batch_data
(
n_head
,
d_model
):
[
inst
[
1
]
for
inst
in
insts
],
trg_pad_idx
,
is_target
=
True
)
"""
Put all padded data needed by training into a dict.
"""
src_word
,
src_pos
,
src_slf_attn_bias
,
src_max_len
=
pad_batch_data
(
[
inst
[
0
]
for
inst
in
insts
],
src_pad_idx
,
n_head
,
is_target
=
False
)
src_word
=
src_word
.
reshape
(
-
1
,
src_max_len
,
1
)
src_pos
=
src_pos
.
reshape
(
-
1
,
src_max_len
,
1
)
trg_word
,
trg_pos
,
trg_slf_attn_bias
,
trg_max_len
=
pad_batch_data
(
[
inst
[
1
]
for
inst
in
insts
],
trg_pad_idx
,
n_head
,
is_target
=
True
)
trg_word
=
trg_word
.
reshape
(
-
1
,
trg_max_len
,
1
)
trg_pos
=
trg_pos
.
reshape
(
-
1
,
trg_max_len
,
1
)
trg_src_attn_bias
=
np
.
tile
(
src_slf_attn_bias
[:,
:,
::
src_max_len
,
:],
trg_src_attn_bias
=
np
.
tile
(
src_slf_attn_bias
[:,
:,
::
src_max_len
,
:],
[
1
,
1
,
trg_max_len
,
1
]).
astype
(
"float32"
)
[
1
,
1
,
trg_max_len
,
1
]).
astype
(
"float32"
)
lbl_word
=
__pad_batch_data
([
inst
[
2
]
for
inst
in
insts
],
trg_pad_idx
,
False
,
False
,
False
,
False
)
lbl_weight
=
(
lbl_word
!=
trg_pad_idx
).
astype
(
"float32"
).
reshape
([
-
1
,
1
])
return
[
lbl_word
,
lbl_weight
,
num_token
=
pad_batch_data
(
src_word
,
src_pos
,
trg_word
,
trg_pos
,
src_slf_attn_bias
,
[
inst
[
2
]
for
inst
in
insts
],
trg_pad_idx
,
n_head
,
is_target
=
False
,
is_label
=
True
,
return_attn_bias
=
False
,
return_max_len
=
False
,
return_num_token
=
True
)
data_input_dict
=
dict
(
zip
(
data_input_names
,
[
src_word
,
src_pos
,
src_slf_attn_bias
,
trg_word
,
trg_pos
,
trg_slf_attn_bias
,
trg_src_attn_bias
,
lbl_word
,
lbl_weight
trg_slf_attn_bias
,
trg_src_attn_bias
,
lbl_word
,
lbl_weight
]))
return
data_input_dict
,
np
.
asarray
([
num_token
],
dtype
=
"float32"
)
def read_multiple(reader, count, clip_last=True):
    """
    Stack data from reader for multi-devices.

    Args:
        reader: A callable returning an iterable of items.
        count (int): Number of items grouped into each yielded list.
        clip_last (bool): If True, a trailing group with fewer than ``count``
            items is dropped. If False, the items of that trailing group are
            concatenated and, when the result holds more than ``count``
            elements, split into ``count`` equal slices (the remainder is
            dropped) which are yielded as one final list.

    Returns:
        A generator function yielding lists of ``count`` entries.
    """

    def __impl__():
        pending = []
        for item in reader():
            pending.append(item)
            if len(pending) != count:
                continue
            yield pending
            pending = []

        if len(pending) == count:
            yield pending
            return
        if clip_last:
            return

        # Flatten the leftover items and re-split them evenly.
        merged = []
        for item in pending:
            merged += item
        if len(merged) <= count:
            return
        part_size = len(merged) // count
        yield [
            merged[part_size * idx:part_size * (idx + 1)]
            for idx in range(count)
        ]

    return __impl__
def split_data(data, num_part):
    """
    Split data for each device.

    Args:
        data (list): Either already holds ``num_part`` entries, or holds the
            whole batch as its first element.
        num_part (int): Number of parts to produce.

    Returns:
        ``data`` itself when ``len(data) == num_part``; otherwise a list of
        ``num_part`` consecutive slices of ``data[0]``, each of length
        ``len(data[0]) // num_part`` (any remainder is dropped).
    """
    if len(data) != num_part:
        batch = data[0]
        part_size = len(batch) // num_part
        return [
            batch[part_size * idx:part_size * idx + part_size]
            for idx in range(num_part)
        ]
    return data
def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names,
                 sum_cost, token_num):
    """
    Build the validation context and return a callable that runs validation.

    A clone of ``train_progm`` is turned into an inference program for
    ``avg_cost``, a ``DataReader`` over ``TrainTaskConfig.val_file_pattern``
    is created, and a ``ParallelExecutor`` sharing variables with
    ``train_exe`` is set up.

    :param train_progm: the training program to clone.
    :param avg_cost: variable passed to ``get_inference_program``.
    :param train_exe: training executor whose variables are shared.
    :param dev_count: number of parts each batch is split into.
    :param data_input_names: names handed to ``prepare_batch_input``.
    :param sum_cost: variable whose ``name`` is fetched as the summed cost.
    :param token_num: variable whose ``name`` is fetched as the token count.
    :return: function ``test(exe=test_exe)`` returning
        ``(test_avg_cost, test_ppl)``.
    """
    # Context to do validation.
    test_program = train_progm.clone()
    with fluid.program_guard(test_program):
        test_program = fluid.io.get_inference_program([avg_cost])

    # Validation data is read without any shuffling and keeps the last
    # (possibly incomplete) batch.
    val_data = DataReader(
        src_vocab_fpath=TrainTaskConfig.src_vocab_fpath,
        trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath,
        fpattern=TrainTaskConfig.val_file_pattern,
        token_delimiter=TrainTaskConfig.token_delimiter,
        use_token_batch=TrainTaskConfig.use_token_batch,
        batch_size=TrainTaskConfig.batch_size *
        (1 if TrainTaskConfig.use_token_batch else dev_count),
        pool_size=TrainTaskConfig.pool_size,
        sort_type=TrainTaskConfig.sort_type,
        start_mark=TrainTaskConfig.special_token[0],
        end_mark=TrainTaskConfig.special_token[1],
        unk_mark=TrainTaskConfig.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False,
        shuffle=False,
        shuffle_batch=False)

    build_strategy = fluid.BuildStrategy()

    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = 1

    test_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu,
        main_program=test_program,
        share_vars_from=train_exe,
        build_strategy=build_strategy,
        exec_strategy=strategy)

    def test(exe=test_exe):
        """Run one pass over the validation data; return (avg cost, ppl)."""
        test_total_cost = 0
        test_total_token = 0
        test_data = read_multiple(
            reader=val_data.batch_generator,
            count=dev_count if TrainTaskConfig.use_token_batch else 1)
        for batch_id, data in enumerate(test_data()):
            feed_list = []
            for place_id, data_buffer in enumerate(
                    split_data(
                        data, num_part=dev_count)):
                data_input_dict, _ = prepare_batch_input(
                    data_buffer, data_input_names, ModelHyperParams.eos_idx,
                    ModelHyperParams.eos_idx, ModelHyperParams.n_head,
                    ModelHyperParams.d_model)
                feed_list.append(data_input_dict)

            outs = exe.run(feed=feed_list,
                           fetch_list=[sum_cost.name, token_num.name])
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            test_total_cost += sum_cost_val.sum()
            test_total_token += token_num_val.sum()
        # NOTE(review): divides by zero if the validation reader yields no
        # batches (test_total_token stays 0) -- confirm this cannot happen.
        test_avg_cost = test_total_cost / test_total_token
        # The average cost is capped at 100 before exponentiation.
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    return test
def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
               token_num, predict):
    """
    Run the training passes and print the validation cost after each batch.

    :param exe: executor used to run the startup program.
    :param train_progm: main training program.
    :param dev_count: number of devices each batch is split across.
    :param sum_cost: summed-cost variable (used as the loss name and fetched).
    :param avg_cost: average-cost variable, forwarded to ``test_context``.
    :param lr_scheduler: object providing ``current_steps``,
        ``update_learning_rate()`` and ``learning_rate``.
    :param token_num: token-count variable (fetched).
    :param predict: not referenced in this function.
    """
    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        exe.run(fluid.framework.default_startup_program())

    train_data = DataReader(
        src_vocab_fpath=TrainTaskConfig.src_vocab_fpath,
        trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath,
        fpattern=TrainTaskConfig.train_file_pattern,
        token_delimiter=TrainTaskConfig.token_delimiter,
        use_token_batch=TrainTaskConfig.use_token_batch,
        batch_size=TrainTaskConfig.batch_size *
        (1 if TrainTaskConfig.use_token_batch else dev_count),
        pool_size=TrainTaskConfig.pool_size,
        sort_type=TrainTaskConfig.sort_type,
        shuffle=TrainTaskConfig.shuffle,
        shuffle_batch=TrainTaskConfig.shuffle_batch,
        start_mark=TrainTaskConfig.special_token[0],
        end_mark=TrainTaskConfig.special_token[1],
        unk_mark=TrainTaskConfig.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False)
    train_data = read_multiple(
        reader=train_data.batch_generator,
        count=dev_count if TrainTaskConfig.use_token_batch else 1)

    build_strategy = fluid.BuildStrategy()
    # Since the token number differs among devices, customize gradient scale to
    # use token average cost among multi-devices. and the gradient scale is
    # `1 / token_number` for average cost.
    build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized

    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = 1

    train_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu,
        loss_name=sum_cost.name,
        main_program=train_progm,
        build_strategy=build_strategy,
        exec_strategy=strategy)

    # The last decoder input field is excluded from the fed names.
    data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
                                                                             -1] + label_data_input_fields

    if TrainTaskConfig.val_file_pattern is not None:
        test = test_context(train_progm, avg_cost, train_exe, dev_count,
                            data_input_names, sum_cost, token_num)

    # the best cross-entropy value with label smoothing
    # (computed here but not used later in this function)
    loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log(
        (1. - TrainTaskConfig.label_smooth_eps)) +
                        TrainTaskConfig.label_smooth_eps *
                        np.log(TrainTaskConfig.label_smooth_eps /
                               (ModelHyperParams.trg_vocab_size - 1) + 1e-20))
    # `init` turns True after the first executed batch; until then the
    # position-encoding tables are added to every feed dict.
    init = False
    for pass_id in xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            # Only the first five batches of each pass are used.
            if batch_id >= 5:
                break

            feed_list = []
            total_num_token = 0

            #if TrainTaskConfig.local:
            #    lr_rate = lr_scheduler.update_learning_rate()
            #for place_id, data_buffer in enumerate(
            #        split_data(
            #            data, num_part=dev_count)):

            if TrainTaskConfig.local:
                lr_rate = lr_scheduler.update_learning_rate()

            for place_id, data_buffer in enumerate(
                    split_data(
                        data, num_part=dev_count)):
                data_input_dict, num_token = prepare_batch_input(
                    data_buffer, data_input_names, ModelHyperParams.eos_idx,
                    ModelHyperParams.eos_idx, ModelHyperParams.n_head,
                    ModelHyperParams.d_model)
                total_num_token += num_token
                # NOTE(review): `+=` below needs items() to return a list
                # (Python 2 behaviour) -- confirm the interpreter version.
                feed_kv_pairs = data_input_dict.items()
                if TrainTaskConfig.local:
                    feed_kv_pairs += {
                        lr_scheduler.learning_rate.name: lr_rate
                    }.items()
                feed_list.append(dict(feed_kv_pairs))

                if not init:
                    for pos_enc_param_name in pos_enc_param_names:
                        pos_enc = position_encoding_init(
                            ModelHyperParams.max_length + 1,
                            ModelHyperParams.d_model)
                        feed_list[place_id][pos_enc_param_name] = pos_enc

            # Feed the customized gradient scale for the summed cost:
            # 1 / (tokens in this step), or a fixed constant when checking
            # accuracy.
            if not TrainTaskConfig.check_acc:
                for feed_dict in feed_list:
                    feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token
            else:
                b = 100 * TrainTaskConfig.batch_size
                a = np.asarray([b], dtype="float32")
                for feed_dict in feed_list:
                    feed_dict[sum_cost.name + "@GRAD"] = 1. / a

            outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
                                 feed=feed_list)

            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            total_sum_cost = sum_cost_val.sum()
            total_token_num = token_num_val.sum()
            total_avg_cost = total_sum_cost / total_token_num

            init = True

            # Validate and save the model for inference.
            if TrainTaskConfig.val_file_pattern is not None:
                val_avg_cost, val_ppl = test()
                print("[%f]" % val_avg_cost)
            else:
                # A validation file pattern is mandatory for this loop.
                assert (False)
#import transformer_reader as reader
class SortType(object):
    """String constants naming the granularity at which samples are sorted."""

    # Do not sort at all.
    NONE = "none"
    # Sort within each pool of samples.
    POOL = 'pool'
    # Sort across all samples.
    GLOBAL = 'global'
class Converter(object):
    """
    Callable turning a delimited sentence into a list of vocabulary ids.

    The result is framed by the ``beg`` and ``end`` ids; tokens missing from
    ``vocab`` are mapped to ``unk``.
    """

    def __init__(self, vocab, beg, end, unk, delimiter):
        self._vocab, self._beg, self._end = vocab, beg, end
        self._unk, self._delimiter = unk, delimiter

    def __call__(self, sentence):
        ids = [self._beg]
        for token in sentence.split(self._delimiter):
            ids.append(self._vocab.get(token, self._unk))
        ids.append(self._end)
        return ids
class ComposedConverter(object):
    """
    Applies the i-th converter to the i-th field of a parallel sentence.
    """

    def __init__(self, converters):
        self._converters = converters

    def __call__(self, parallel_sentence):
        converted = []
        for idx in range(len(self._converters)):
            convert = self._converters[idx]
            converted.append(convert(parallel_sentence[idx]))
        return converted
class SentenceBatchCreator(object):
    """
    Collects samples into batches holding a fixed number of sentences.
    """

    def __init__(self, batch_size):
        self._batch_size = batch_size
        self.batch = []

    def append(self, info):
        """Add ``info``; return the batch once it is full, otherwise None."""
        self.batch.append(info)
        if len(self.batch) != self._batch_size:
            return None
        full, self.batch = self.batch, []
        return full
class TokenBatchCreator(object):
    """
    Collects samples into batches bounded by a padded token budget.

    The cost of a batch is ``longest max_len * number of samples``; a sample
    that would push this cost above ``batch_size`` closes the current batch
    and starts a new one.
    """

    def __init__(self, batch_size):
        self._batch_size = batch_size
        self.batch = []
        self.max_len = -1

    def append(self, info):
        """Add ``info``; return the closed batch if one was flushed."""
        cur_len = info.max_len
        widest = max(self.max_len, cur_len)
        if widest * (len(self.batch) + 1) <= self._batch_size:
            # Still within budget: keep growing the current batch.
            self.max_len = widest
            self.batch.append(info)
            return None

        # Budget exceeded: hand out the current batch and start over.
        flushed = self.batch
        self.batch = [info]
        self.max_len = cur_len
        return flushed
class SampleInfo(object):
    """Index of a sample together with its longest and shortest length."""

    def __init__(self, i, max_len, min_len):
        self.i, self.min_len, self.max_len = i, min_len, max_len
class MinMaxFilter(object):
    """
    Batch-creator wrapper that silently drops samples whose lengths fall
    outside ``[min_len, max_len]`` and forwards the rest.
    """

    def __init__(self, max_len, min_len, underlying_creator):
        self._creator = underlying_creator
        self._max_len = max_len
        self._min_len = min_len

    def append(self, info):
        """Forward ``info`` to the wrapped creator unless it is filtered."""
        within_bounds = not (info.max_len > self._max_len or
                             info.min_len < self._min_len)
        if within_bounds:
            return self._creator.append(info)
        return None

    @property
    def batch(self):
        """The wrapped creator's current (unfinished) batch."""
        return self._creator.batch
class DataReader(object):
    """
    The data reader loads all data from files and produces batches of data
    in the way corresponding to settings.

    An example of returning a generator producing data batches whose data
    is shuffled in each pass and sorted in each pool:

    ```
    train_data = DataReader(
        src_vocab_fpath='data/src_vocab_file',
        trg_vocab_fpath='data/trg_vocab_file',
        fpattern='data/part-*',
        use_token_batch=True,
        batch_size=2000,
        pool_size=10000,
        sort_type=SortType.POOL,
        shuffle=True,
        shuffle_batch=True,
        start_mark='<s>',
        end_mark='<e>',
        unk_mark='<unk>',
        clip_last_batch=False).batch_generator
    ```

    :param src_vocab_fpath: The path of vocabulary file of source language.
    :type src_vocab_fpath: basestring
    :param trg_vocab_fpath: The path of vocabulary file of target language.
    :type trg_vocab_fpath: basestring
    :param fpattern: The pattern to match data files.
    :type fpattern: basestring
    :param batch_size: The number of sequences contained in a mini-batch.
        or the maximum number of tokens (include paddings) contained in a
        mini-batch.
    :type batch_size: int
    :param pool_size: The size of pool buffer.
    :type pool_size: int
    :param sort_type: The grain to sort by length: 'global' for all
        instances; 'pool' for instances in pool; 'none' for no sort.
    :type sort_type: basestring
    :param clip_last_batch: Whether to clip the last uncompleted batch.
    :type clip_last_batch: bool
    :param tar_fname: The data file in tar if fpattern matches a tar file.
    :type tar_fname: basestring
    :param min_length: The minimum length used to filter sequences.
    :type min_length: int
    :param max_length: The maximum length used to filter sequences.
    :type max_length: int
    :param shuffle: Whether to shuffle all instances.
    :type shuffle: bool
    :param shuffle_batch: Whether to shuffle the generated batches.
    :type shuffle_batch: bool
    :param use_token_batch: Whether to produce batch data according to
        token number.
    :type use_token_batch: bool
    :param field_delimiter: The delimiter used to split source and target in
        each line of data file.
    :type field_delimiter: basestring
    :param token_delimiter: The delimiter used to split tokens in source or
        target sentences.
    :type token_delimiter: basestring
    :param start_mark: The token representing for the beginning of
        sentences in dictionary.
    :type start_mark: basestring
    :param end_mark: The token representing for the end of sentences
        in dictionary.
    :type end_mark: basestring
    :param unk_mark: The token representing for unknown word in dictionary.
    :type unk_mark: basestring
    :param seed: The seed for random.
    :type seed: int
    """

    def __init__(self,
                 src_vocab_fpath,
                 trg_vocab_fpath,
                 fpattern,
                 batch_size,
                 pool_size,
                 sort_type=SortType.GLOBAL,
                 clip_last_batch=True,
                 tar_fname=None,
                 min_length=0,
                 max_length=100,
                 shuffle=True,
                 shuffle_batch=False,
                 use_token_batch=False,
                 field_delimiter="\t",
                 token_delimiter=" ",
                 start_mark="<s>",
                 end_mark="<e>",
                 unk_mark="<unk>",
                 seed=0):
        self._src_vocab = self.load_dict(src_vocab_fpath)
        # Without a target vocabulary only source sentences are read.
        self._only_src = True
        if trg_vocab_fpath is not None:
            self._trg_vocab = self.load_dict(trg_vocab_fpath)
            self._only_src = False
        self._pool_size = pool_size
        self._batch_size = batch_size
        self._use_token_batch = use_token_batch
        self._sort_type = sort_type
        self._clip_last_batch = clip_last_batch
        self._shuffle = shuffle
        self._shuffle_batch = shuffle_batch
        self._min_length = min_length
        self._max_length = max_length
        self._field_delimiter = field_delimiter
        self._token_delimiter = token_delimiter
        self.load_src_trg_ids(end_mark, fpattern, start_mark, tar_fname,
                              unk_mark)
        self._random = random.Random(x=seed)

    def load_src_trg_ids(self, end_mark, fpattern, start_mark, tar_fname,
                         unk_mark):
        """
        Read every matching line, convert it to id sequences and record a
        ``SampleInfo`` (index, longest and shortest length) per sample.
        """
        converters = [
            Converter(
                vocab=self._src_vocab,
                beg=self._src_vocab[start_mark],
                end=self._src_vocab[end_mark],
                unk=self._src_vocab[unk_mark],
                delimiter=self._token_delimiter)
        ]
        if not self._only_src:
            converters.append(
                Converter(
                    vocab=self._trg_vocab,
                    beg=self._trg_vocab[start_mark],
                    end=self._trg_vocab[end_mark],
                    unk=self._trg_vocab[unk_mark],
                    delimiter=self._token_delimiter))

        converters = ComposedConverter(converters)

        self._src_seq_ids = []
        self._trg_seq_ids = None if self._only_src else []
        self._sample_infos = []

        for i, line in enumerate(self._load_lines(fpattern, tar_fname)):
            src_trg_ids = converters(line)
            self._src_seq_ids.append(src_trg_ids[0])
            lens = [len(src_trg_ids[0])]
            if not self._only_src:
                self._trg_seq_ids.append(src_trg_ids[1])
                lens.append(len(src_trg_ids[1]))
            self._sample_infos.append(SampleInfo(i, max(lens), min(lens)))

    def _load_lines(self, fpattern, tar_fname):
        """
        Yield the delimiter-split fields of every well-formed line.

        A line is well formed when it has two fields (source and target) or,
        in source-only mode, exactly one field; other lines are skipped.

        :raises Exception: if ``fpattern`` matches a single tar archive but
            ``tar_fname`` is None.
        :raises IOError: if a matched path is not a regular file.
        """
        fpaths = glob.glob(fpattern)

        if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]):
            if tar_fname is None:
                raise Exception("If tar file provided, please set tar_fname.")

            # The context manager closes the archive once reading finishes
            # (or the generator is closed); it used to be left open.
            with tarfile.open(fpaths[0], "r") as f:
                for line in f.extractfile(tar_fname):
                    fields = line.strip("\n").split(self._field_delimiter)
                    if (not self._only_src and len(fields) == 2) or (
                            self._only_src and len(fields) == 1):
                        yield fields
        else:
            for fpath in fpaths:
                if not os.path.isfile(fpath):
                    raise IOError("Invalid file: %s" % fpath)

                with open(fpath, "r") as f:
                    for line in f:
                        fields = line.strip("\n").split(self._field_delimiter)
                        if (not self._only_src and len(fields) == 2) or (
                                self._only_src and len(fields) == 1):
                            yield fields

    @staticmethod
    def load_dict(dict_path, reverse=False):
        """
        Load a vocabulary file with one entry per line.

        :return: ``{word: line_index}``, or ``{line_index: word}`` when
            ``reverse`` is true.
        """
        word_dict = {}
        with open(dict_path, "r") as fdict:
            for idx, line in enumerate(fdict):
                if reverse:
                    word_dict[idx] = line.strip("\n")
                else:
                    word_dict[line.strip("\n")] = idx
        return word_dict

    def batch_generator(self):
        """
        Yield batches of id sequences according to the configured sorting,
        shuffling and batching options.

        In source-only mode each batch item is ``[src_ids]``; otherwise it is
        ``(src_ids, trg_ids[:-1], trg_ids[1:])``.
        """
        # global sort or global shuffle
        if self._sort_type == SortType.GLOBAL:
            infos = sorted(
                self._sample_infos, key=lambda x: x.max_len, reverse=True)
        else:
            if self._shuffle:
                # Shuffles the stored sample infos in place.
                infos = self._sample_infos
                self._random.shuffle(infos)
            else:
                infos = self._sample_infos

            if self._sort_type == SortType.POOL:
                for i in range(0, len(infos), self._pool_size):
                    infos[i:i + self._pool_size] = sorted(
                        infos[i:i + self._pool_size], key=lambda x: x.max_len)

        # concat batch
        batches = []
        batch_creator = TokenBatchCreator(
            self._batch_size
        ) if self._use_token_batch else SentenceBatchCreator(self._batch_size)
        batch_creator = MinMaxFilter(self._max_length, self._min_length,
                                     batch_creator)

        for info in infos:
            batch = batch_creator.append(info)
            if batch is not None:
                batches.append(batch)

        if not self._clip_last_batch and len(batch_creator.batch) != 0:
            batches.append(batch_creator.batch)

        if self._shuffle_batch:
            self._random.shuffle(batches)

        for batch in batches:
            batch_ids = [info.i for info in batch]

            if self._only_src:
                yield [[self._src_seq_ids[idx]] for idx in batch_ids]
            else:
                yield [(self._src_seq_ids[idx], self._trg_seq_ids[idx][:-1],
                        self._trg_seq_ids[idx][1:]) for idx in batch_ids]
#from transformer_model import transformer
def position_encoding_init(n_position, d_pos_vec):
    """
    Generate the initial values for the sinusoid position encoding table.

    Entry ``[pos, j]`` starts as ``pos / 10000 ** (2 * (j // 2) / d_pos_vec)``;
    for ``pos >= 1`` even columns are then passed through ``sin`` and odd
    columns through ``cos``. Row 0 is left as all zeros.

    The exponent is computed with explicit floating-point division so the
    table is the same under Python 2 and Python 3 (integer division would
    make every exponent 0), and ``n_position == 0`` yields an empty
    ``(0, d_pos_vec)`` table instead of failing.

    :param n_position: number of positions (rows).
    :param d_pos_vec: encoding dimension (columns).
    :return: ``float32`` array of shape ``(n_position, d_pos_vec)``.
    """
    positions = np.arange(n_position, dtype="float64").reshape(-1, 1)
    exponents = 2.0 * (np.arange(d_pos_vec) // 2) / float(max(d_pos_vec, 1))
    position_enc = positions / np.power(10000.0, exponents)

    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
    return position_enc.astype("float32")
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None):
    """
    Multi-Head Attention. Note that attn_bias is added to the logit before
    computing softmax activation to mask certain selected positions so that
    they will not be considered in attention weights.

    :param queries, keys, values: inputs; each must have a 3-element shape.
    :param attn_bias: bias added to the attention logits when truthy.
    :param d_key: per-head size of the query/key projections.
    :param d_value: per-head size of the value projection.
    :param d_model: size of the output projection; also used for scaling.
    :param n_head: number of attention heads.
    :param dropout_rate: dropout applied to the attention weights when truthy.
    :param cache: optional dict with entries "k" and "v"; the new keys and
        values are concatenated to them along axis 1 and written back.
    :return: the projected attention output.
    :raises ValueError: if queries, keys and values are not all 3-D.
    """
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: quries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=const_para_attr,
                      bias_attr=const_bias_attr)
        k = layers.fc(input=keys,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=const_para_attr,
                      bias_attr=const_bias_attr)
        v = layers.fc(input=values,
                      size=d_value * n_head,
                      num_flatten_dims=2,
                      param_attr=const_para_attr,
                      bias_attr=const_bias_attr)
        return q, k, v

    def __split_heads(x, n_head):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        # A single head needs no reshaping; x is returned as is.
        if n_head == 1:
            return x

        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped = layers.reshape(
            x=x, shape=[0, 0, n_head, hidden_size // n_head])

        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of input tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        # 3-D input means the heads were never split (n_head == 1 path).
        if len(x.shape) == 3: return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))

    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        # Queries are scaled by d_model ** -0.5 before the dot product.
        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(
                weights,
                dropout_prob=dropout_rate,
                seed=ModelHyperParams.dropout_seed,
                is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)

    if cache is not None:  # use cache and concat time steps
        k = cache["k"] = layers.concat([cache["k"], k], axis=1)
        v = cache["v"] = layers.concat([cache["v"], v], axis=1)

    q = __split_heads(q, n_head)
    k = __split_heads(k, n_head)
    v = __split_heads(v, n_head)

    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
                                                  dropout_rate)

    out = __combine_heads(ctx_multiheads)

    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         num_flatten_dims=2,
                         param_attr=const_para_attr,
                         bias_attr=const_bias_attr)
    return proj_out
def positionwise_feed_forward(x, d_inner_hid, d_hid):
    """
    Position-wise Feed-Forward Networks.

    Two fully connected layers applied with ``num_flatten_dims=2``: the first
    maps to ``d_inner_hid`` with a ReLU activation, the second maps to
    ``d_hid`` without activation.
    """
    # Arguments common to both fully connected layers.
    shared_fc_args = dict(
        num_flatten_dims=2,
        param_attr=const_para_attr,
        bias_attr=const_bias_attr)
    inner = layers.fc(input=x, size=d_inner_hid, act="relu", **shared_fc_args)
    return layers.fc(input=inner, size=d_hid, **shared_fc_args)
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
    """
    Apply the steps named by the characters of ``process_cmd`` to ``out``,
    in order:

    * ``"a"`` - add ``prev_out`` (residual connection) when it is truthy;
    * ``"n"`` - layer normalization over the last axis;
    * ``"d"`` - dropout, when ``dropout_rate`` is truthy.

    Any other character is ignored. This is used before or after multi-head
    attention and position-wise feed-forward networks.
    """
    for step in process_cmd:
        if step == "a":
            # residual connection, skipped when there is no previous output
            if prev_out:
                out = out + prev_out
        elif step == "n":
            # layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.initializer.Constant(1.),
                bias_attr=fluid.initializer.Constant(0.))
        elif step == "d":
            # dropout
            if dropout_rate:
                out = layers.dropout(
                    out,
                    dropout_prob=dropout_rate,
                    seed=ModelHyperParams.dropout_seed,
                    is_test=False)
    return out
# Pre-processing variant: pre_post_process_layer with prev_out bound to None,
# so the "a" (residual) step leaves the tensor unchanged.
pre_process_layer = partial(pre_post_process_layer, None)
# Post-processing uses pre_post_process_layer directly, with the previous
# output supplied by the caller.
post_process_layer = pre_post_process_layer
def prepare_encoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_max_len,
                    dropout_rate=0.,
                    word_emb_param_name=None,
                    pos_enc_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    This module is used at the bottom of the encoder stacks.
    """
    # With check_acc the word embedding starts from a constant; otherwise it
    # is drawn from Normal(0, src_emb_dim ** -0.5).
    if TrainTaskConfig.check_acc:
        word_emb_init = fluid.initializer.ConstantInitializer(0.001)
    else:
        word_emb_init = fluid.initializer.Normal(0., src_emb_dim**-0.5)

    word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        param_attr=fluid.ParamAttr(
            name=word_emb_param_name, initializer=word_emb_init))
    word_emb = layers.scale(x=word_emb, scale=src_emb_dim**0.5)

    # The position table is a non-trainable parameter.
    pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name,
            trainable=False,
            initializer=fluid.initializer.ConstantInitializer(0.001)))

    enc_input = word_emb + pos_enc
    if not dropout_rate:
        return enc_input
    return layers.dropout(
        enc_input,
        dropout_prob=dropout_rate,
        seed=ModelHyperParams.dropout_seed,
        is_test=False)
# Rebind prepare_encoder so it always uses the first position-encoding
# parameter name.
prepare_encoder = partial(
    prepare_encoder, pos_enc_param_name=pos_enc_param_names[0])
# prepare_decoder wraps the (already partial) prepare_encoder and overrides
# the keyword with the second position-encoding parameter name.
prepare_decoder = partial(
    prepare_encoder, pos_enc_param_name=pos_enc_param_names[1])
def encoder_layer(enc_input,
                  attn_bias,
                  n_head,
                  d_key,
                  d_value,
                  d_model,
                  d_inner_hid,
                  dropout_rate=0.):
    """The encoder layers that can be stacked to form a deep encoder.

    A multi-head self-attention sub-layer followed by a position-wise
    feed-forward sub-layer; each is wrapped by post_process_layer with the
    command "dan" (dropout, residual add, layer normalization).
    """
    # Self-attention: the layer input serves as queries, keys and values.
    self_attn = multi_head_attention(enc_input, enc_input, enc_input,
                                     attn_bias, d_key, d_value, d_model,
                                     n_head, dropout_rate)
    attn_block_out = post_process_layer(enc_input, self_attn, "dan",
                                        dropout_rate)

    ffn = positionwise_feed_forward(attn_block_out, d_inner_hid, d_model)
    return post_process_layer(attn_block_out, ffn, "dan", dropout_rate)
def encoder(enc_input,
            attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            dropout_rate=0.):
    """
    The encoder is composed of a stack of identical layers returned by calling
    encoder_layer.

    Each layer consumes the previous layer's output. With ``n_layer == 0``
    the input is returned unchanged (this used to raise UnboundLocalError).

    :return: output of the last encoder layer.
    """
    enc_output = enc_input
    for _ in range(n_layer):
        enc_output = encoder_layer(enc_output, attn_bias, n_head, d_key,
                                   d_value, d_model, d_inner_hid, dropout_rate)
    return enc_output
def decoder_layer(dec_input,
                  enc_output,
                  slf_attn_bias,
                  dec_enc_attn_bias,
                  n_head,
                  d_key,
                  d_value,
                  d_model,
                  d_inner_hid,
                  dropout_rate=0.,
                  cache=None):
    """ The layer to be stacked in decoder part.

    Three sub-layers in sequence - self-attention (optionally using
    ``cache``), encoder-decoder attention over ``enc_output`` and a
    position-wise feed-forward network - each followed by
    post_process_layer with the command "dan".
    """
    # "dan": dropout, residual connection, layer normalization
    post_cmd = "dan"

    self_attn = multi_head_attention(
        dec_input,
        dec_input,
        dec_input,
        slf_attn_bias,
        d_key,
        d_value,
        d_model,
        n_head,
        dropout_rate,
        cache)
    self_attn = post_process_layer(dec_input, self_attn, post_cmd,
                                   dropout_rate)

    # Queries come from the decoder, keys/values from the encoder output.
    cross_attn = multi_head_attention(
        self_attn,
        enc_output,
        enc_output,
        dec_enc_attn_bias,
        d_key,
        d_value,
        d_model,
        n_head,
        dropout_rate)
    cross_attn = post_process_layer(self_attn, cross_attn, post_cmd,
                                    dropout_rate)

    ffn = positionwise_feed_forward(cross_attn, d_inner_hid, d_model)
    return post_process_layer(cross_attn, ffn, post_cmd, dropout_rate)
def decoder(dec_input,
            enc_output,
            dec_slf_attn_bias,
            dec_enc_attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            dropout_rate=0.,
            caches=None):
    """
    The decoder is composed of a stack of identical decoder_layer layers.

    Each layer consumes the previous layer's output; when ``caches`` is
    given, layer ``i`` receives ``caches[i]``. With ``n_layer == 0`` the
    input is returned unchanged (this used to raise UnboundLocalError).

    :return: output of the last decoder layer.
    """
    dec_output = dec_input
    for i in range(n_layer):
        cache = caches[i] if caches is not None else None
        dec_output = decoder_layer(
            dec_output,
            enc_output,
            dec_slf_attn_bias,
            dec_enc_attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            dropout_rate,
            cache=cache)
    return dec_output
def make_all_inputs(input_fields):
    """
    Define the input data layers for the transformer model.

    For every name in ``input_fields`` a data layer is created from its
    ``input_descs`` entry: element 0 is the shape, element 1 the dtype and,
    when the entry has three elements, element 2 the LoD level (0 otherwise).

    :return: list of the created input variables, in the order given.
    """

    def _declare(field):
        # Create the data layer described by input_descs[field].
        return layers.data(
            name=field,
            shape=input_descs[field][0],
            dtype=input_descs[field][1],
            lod_level=input_descs[field][2]
            if len(input_descs[field]) == 3 else 0,
            append_batch_size=False)

    return [_declare(field) for field in input_fields]
def transformer(
        src_vocab_size,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        weight_sharing,
        label_smooth_eps, ):
    """
    Build the training network: encoder, decoder and the weighted loss.

    :param weight_sharing: when truthy the source and target vocabularies
        must have the same size (checked by an assertion).
    :param label_smooth_eps: when truthy, labels are one-hot encoded and
        smoothed with this epsilon and the loss uses soft labels.
    :return: ``(sum_cost, avg_cost, predict, token_num)``; ``avg_cost`` has
        ``stop_gradient`` set.
    """
    if weight_sharing:
        # Compare source against *target* size; the previous check compared
        # src_vocab_size with itself and could never fail.
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )
    enc_inputs = make_all_inputs(encoder_data_input_fields)

    enc_output = wrap_encoder(
        src_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        weight_sharing,
        enc_inputs, )

    # The last decoder input field is not created for training.
    dec_inputs = make_all_inputs(decoder_data_input_fields[:-1])

    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        weight_sharing,
        dec_inputs,
        enc_output, )

    # Padding index do not contribute to the total loss. The weights is used to
    # cancel padding index in calculating the loss.
    label, weights = make_all_inputs(label_data_input_fields)
    if label_smooth_eps:
        label = layers.label_smooth(
            label=layers.one_hot(
                input=label, depth=trg_vocab_size),
            epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=layers.reshape(
            predict, shape=[-1, trg_vocab_size]),
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    avg_cost = sum_cost / token_num
    avg_cost.stop_gradient = True
    return sum_cost, avg_cost, predict, token_num
def wrap_encoder(src_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 dropout_rate,
                 weight_sharing,
                 enc_inputs=None):
    """
    The wrapper assembles together all needed layers for the encoder.

    When ``enc_inputs`` is None the encoder input layers are created here,
    which is used to implement an independent encoder program in inference.
    """
    if enc_inputs is None:
        enc_inputs = make_all_inputs(encoder_data_input_fields)
    src_word, src_pos, src_slf_attn_bias = enc_inputs

    embedded = prepare_encoder(
        src_word,
        src_pos,
        src_vocab_size,
        d_model,
        max_length,
        dropout_rate,
        word_emb_param_name=word_emb_param_names[0])
    return encoder(embedded, src_slf_attn_bias, n_layer, n_head, d_key,
                   d_value, d_model, d_inner_hid, dropout_rate)
def wrap_decoder(trg_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 dropout_rate,
                 weight_sharing,
                 dec_inputs=None,
                 enc_output=None,
                 caches=None):
    """
    The wrapper assembles together all needed layers for the decoder.

    When ``dec_inputs`` is None the decoder input layers (including the
    encoder output) are created here, which is used to implement an
    independent decoder program in inference; in that case softmax
    probabilities are returned instead of logits.
    """
    standalone = dec_inputs is None
    if standalone:
        (trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias,
         enc_output) = make_all_inputs(decoder_data_input_fields +
                                       decoder_util_input_fields)
    else:
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs

    # With weight sharing the decoder reuses the first word-embedding name.
    if weight_sharing:
        trg_emb_name = word_emb_param_names[0]
    else:
        trg_emb_name = word_emb_param_names[1]

    embedded = prepare_decoder(
        trg_word,
        trg_pos,
        trg_vocab_size,
        d_model,
        max_length,
        dropout_rate,
        word_emb_param_name=trg_emb_name)
    decoded = decoder(
        embedded,
        enc_output,
        trg_slf_attn_bias,
        trg_src_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        caches=caches)

    # Return logits for training and probs for inference.
    if weight_sharing:
        # Project with the transposed shared embedding matrix.
        scores = layers.matmul(
            x=decoded,
            y=fluid.get_var(word_emb_param_names[0]),
            transpose_y=True)
    else:
        scores = layers.fc(input=decoded,
                           size=trg_vocab_size,
                           num_flatten_dims=2,
                           param_attr=const_para_attr,
                           bias_attr=const_bias_attr)
    if standalone:
        scores = layers.softmax(scores)
    return scores
def fast_decode(
        src_vocab_size,
        trg_vocab_size,
        max_in_len,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        weight_sharing,
        beam_size,
        max_out_len,
        eos_idx, ):
    """
    Decode with beam search. Per-layer caches keep the states of previous
    steps so that each step does not recompute them.

    Returns the pair produced by layers.beam_search_decode
    (finished ids, finished scores).
    """
    enc_output = wrap_encoder(src_vocab_size, max_in_len, n_layer, n_head,
                              d_key, d_value, d_model, d_inner_hid,
                              dropout_rate, weight_sharing)
    start_tokens, init_scores, trg_src_attn_bias = make_all_inputs(
        fast_decoder_data_input_fields)

    def beam_search():
        index_dtype = start_tokens.dtype
        max_len = layers.fill_constant(
            shape=[1], dtype=index_dtype, value=max_out_len)
        step_idx = layers.fill_constant(shape=[1], dtype=index_dtype, value=0)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        # Array states: one entry is written per decoding step.
        first_ids = layers.reshape(start_tokens, (-1, 1))
        ids = layers.array_write(first_ids, step_idx)
        scores = layers.array_write(init_scores, step_idx)

        def _empty_state():
            # Zero-filled tensor shaped [-1, 0, d_model], batch-sized like
            # start_tokens.
            return layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0)

        # Cell states are overwritten at each step.
        # caches holds the states of history steps to reduce redundant
        # computation in the decoder ("k" is created before "v").
        caches = []
        for _ in range(n_layer):
            caches.append({"k": _empty_state(), "v": _empty_state()})

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1))
            pre_scores = layers.array_read(array=scores, i=step_idx)

            # sequence_expand gathers sequences according to lod, so it can
            # be used in beam search to sift the states that correspond to
            # the selected ids.
            pre_src_attn_bias = layers.sequence_expand(
                x=trg_src_attn_bias, y=pre_scores)
            pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
            pre_caches = []
            for cache in caches:
                expanded_k = layers.sequence_expand(x=cache["k"], y=pre_scores)
                expanded_v = layers.sequence_expand(x=cache["v"], y=pre_scores)
                pre_caches.append({"k": expanded_k, "v": expanded_v})

            # pre_ids cannot be used as `input` here since it has lod.
            ones = layers.fill_constant_batch_size_like(
                input=pre_enc_output,
                value=1,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype)
            next_step = layers.increment(
                x=step_idx, value=1.0, in_place=False)
            pre_pos = layers.elementwise_mul(x=ones, y=next_step, axis=0)

            logits = wrap_decoder(
                trg_vocab_size,
                max_in_len,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                dropout_rate,
                weight_sharing,
                dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias),
                enc_output=pre_enc_output,
                caches=pre_caches)
            logits = layers.reshape(logits, (-1, trg_vocab_size))

            probs = layers.softmax(logits)
            topk_scores, topk_indices = layers.topk(input=probs, k=beam_size)
            log_topk = layers.log(topk_scores)
            flat_pre_scores = layers.reshape(pre_scores, shape=[-1])
            accu_scores = layers.elementwise_add(
                x=log_topk, y=flat_pre_scores, axis=0)

            # The beam_search op uses lod to distinguish branches.
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            selected_ids, selected_scores = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=eos_idx)

            layers.increment(x=step_idx, value=1.0, in_place=True)

            # Update states for the next iteration.
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
            layers.assign(pre_enc_output, enc_output)
            for pre_cache, cache in zip(pre_caches, caches):
                layers.assign(pre_cache["k"], cache["k"])
                layers.assign(pre_cache["v"], cache["v"])

            # Continue while the step limit is not reached and the selected
            # ids are not empty; the result is written back into `cond`.
            length_cond = layers.less_than(x=step_idx, y=max_len)
            selection_empty = layers.is_empty(x=selected_ids)
            finish_cond = layers.logical_not(selection_empty)
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=eos_idx)
        return finished_ids, finished_scores

    return beam_search()
def get_model(is_dist, is_async):
    """
    Build the transformer network and attach an optimizer to sum_cost.

    Optimizer selection:
      * not is_dist: Adam driven by the local LearningRateScheduler.
      * is_dist and is_async: SGD with a fixed rate of 0.003.
      * is_dist and not is_async: Adam driven by noam_decay.

    Returns (sum_cost, avg_cost, predict, token_num, local_lr_scheduler).
    """
    hp = ModelHyperParams
    cfg = TrainTaskConfig

    sum_cost, avg_cost, predict, token_num = transformer(
        hp.src_vocab_size, hp.trg_vocab_size, hp.max_length + 1, hp.n_layer,
        hp.n_head, hp.d_key, hp.d_value, hp.d_model, hp.d_inner_hid,
        hp.dropout, hp.weight_sharing, cfg.label_smooth_eps)

    local_lr_scheduler = LearningRateScheduler(hp.d_model, cfg.warmup_steps,
                                               cfg.learning_rate)

    if not is_dist:
        optimizer = fluid.optimizer.Adam(
            learning_rate=local_lr_scheduler.learning_rate,
            beta1=cfg.beta1,
            beta2=cfg.beta2,
            epsilon=cfg.eps)
    elif is_async:
        optimizer = fluid.optimizer.SGD(0.003)
    else:
        lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
            hp.d_model, cfg.warmup_steps)
        optimizer = fluid.optimizer.Adam(
            learning_rate=lr_decay,
            beta1=cfg.beta1,
            beta2=cfg.beta2,
            epsilon=cfg.eps)

    # Every branch minimizes the same cost right after creating its optimizer.
    optimizer.minimize(sum_cost)

    return sum_cost, avg_cost, predict, token_num, local_lr_scheduler
def
get_transpiler
(
trainer_id
,
main_program
,
pserver_endpoints
,
trainers
):
def
get_transpiler
(
trainer_id
,
main_program
,
pserver_endpoints
,
trainers
):
...
@@ -176,10 +1677,23 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
...
@@ -176,10 +1677,23 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
return
t
return
t
class
DistTransformer2x2
(
object
):
def update_args():
    """
    Load the source/target dictionaries and merge the derived vocabulary
    sizes and special-token indices into TrainTaskConfig and
    ModelHyperParams via merge_cfg_from_list.

    The bos/eos/unk indices are all looked up in the source dictionary.
    """
    src_dict = DataReader.load_dict(TrainTaskConfig.src_vocab_fpath)
    trg_dict = DataReader.load_dict(TrainTaskConfig.trg_vocab_fpath)

    # Flat [key, value, key, value, ...] list; every value is a string.
    dict_args = [
        "src_vocab_size", str(len(src_dict)),
        "trg_vocab_size", str(len(trg_dict)),
    ]
    for position, key in enumerate(("bos_idx", "eos_idx", "unk_idx")):
        token = TrainTaskConfig.special_token[position]
        dict_args.append(key)
        dict_args.append(str(src_dict[token]))

    merge_cfg_from_list(dict_args, [TrainTaskConfig, ModelHyperParams])
class
DistTransformer2x2
(
TestDistRunnerBase
):
def
run_pserver
(
self
,
pserver_endpoints
,
trainers
,
current_endpoint
,
def
run_pserver
(
self
,
pserver_endpoints
,
trainers
,
current_endpoint
,
trainer_id
):
trainer_id
,
sync_mode
):
get_model
()
get_model
(
True
,
not
sync_mode
)
t
=
get_transpiler
(
trainer_id
,
t
=
get_transpiler
(
trainer_id
,
fluid
.
default_main_program
(),
pserver_endpoints
,
fluid
.
default_main_program
(),
pserver_endpoints
,
trainers
)
trainers
)
...
@@ -196,7 +1710,6 @@ class DistTransformer2x2(object):
...
@@ -196,7 +1710,6 @@ class DistTransformer2x2(object):
while
True
:
while
True
:
assert
retry_times
>=
0
,
"wait ps ready failed"
assert
retry_times
>=
0
,
"wait ps ready failed"
time
.
sleep
(
3
)
time
.
sleep
(
3
)
print
(
"waiting ps ready: "
,
pid
)
try
:
try
:
# the listen_and_serv_op would touch a file which contains the listen port
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
# on the /tmp directory until it was ready to process all the RPC call.
...
@@ -205,63 +1718,35 @@ class DistTransformer2x2(object):
...
@@ -205,63 +1718,35 @@ class DistTransformer2x2(object):
except
os
.
error
:
except
os
.
error
:
retry_times
-=
1
retry_times
-=
1
def
run_trainer
(
self
,
place
,
endpoints
,
trainer_id
,
trainers
,
is_dist
=
True
):
def
run_trainer
(
self
,
avg_cost
=
get_model
()
place
,
endpoints
,
trainer_id
,
trainers
,
is_dist
=
True
,
sync_mode
=
True
):
sum_cost
,
avg_cost
,
predict
,
token_num
,
local_lr_scheduler
=
get_model
(
is_dist
,
not
sync_mode
)
if
is_dist
:
if
is_dist
:
t
=
get_transpiler
(
trainer_id
,
t
=
get_transpiler
(
trainer_id
,
fluid
.
default_main_program
(),
endpoints
,
fluid
.
default_main_program
(),
endpoints
,
trainers
)
trainers
)
trainer_prog
=
t
.
get_trainer_program
()
trainer_prog
=
t
.
get_trainer_program
()
TrainTaskConfig
.
batch_size
=
10
TrainTaskConfig
.
train_file_pattern
=
TrainTaskConfig
.
data_path
+
"train.tok.clean.bpe.32000.en-de.train_{}"
.
format
(
trainer_id
)
else
:
else
:
TrainTaskConfig
.
batch_size
=
20
trainer_prog
=
fluid
.
default_main_program
()
trainer_prog
=
fluid
.
default_main_program
()
startup_exe
=
fluid
.
Executor
(
place
)
startup_exe
=
fluid
.
Executor
(
place
)
startup_exe
.
run
(
fluid
.
default_startup_program
())
strategy
=
fluid
.
ExecutionStrategy
()
TrainTaskConfig
.
local
=
not
is_dist
strategy
.
num_threads
=
1
strategy
.
allow_op_delay
=
False
train_loop
(
startup_exe
,
trainer_prog
,
1
,
sum_cost
,
avg_cost
,
exe
=
fluid
.
ParallelExecutor
(
local_lr_scheduler
,
token_num
,
predict
)
True
,
loss_name
=
avg_cost
.
name
,
exec_strategy
=
strategy
)
first_loss
,
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
])
print
(
first_loss
)
for
i
in
six
.
moves
.
xrange
(
5
):
_
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
])
last_loss
,
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
])
print
(
last_loss
)
def main(role="pserver",
         endpoints="127.0.0.1:9123",
         trainer_id=0,
         current_endpoint="127.0.0.1:9123",
         trainers=1,
         is_dist=True):
    """
    Write the batched WMT16 training data to WMT16_RECORDIO_FILE, then run a
    DistTransformer2x2 as either a parameter server (role == "pserver") or a
    trainer (any other role).
    """
    batched_reader = paddle.batch(
        wmt16.train(ModelHyperParams.src_vocab_size,
                    ModelHyperParams.trg_vocab_size),
        batch_size=transformer_model.batch_size)

    with fluid.recordio_writer.create_recordio_writer(
            WMT16_RECORDIO_FILE) as writer:
        for batch in batched_reader():
            tensors = prepare_batch_input(
                batch, ModelHyperParams.src_pad_idx,
                ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head)
            for tensor in tensors:
                lod_tensor = fluid.LoDTensor()
                lod_tensor.set(tensor, fluid.CPUPlace())
                writer.append_tensor(lod_tensor)
            # One call per batch, after all of the batch's tensors are appended.
            writer.complete_append_tensor()

    model = DistTransformer2x2()
    if role == "pserver":
        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
        return

    # Trainer: use CUDA device 0 when available, otherwise the CPU.
    if core.is_compiled_with_cuda():
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    model.run_trainer(place, endpoints, trainer_id, trainers, is_dist)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
@@ -269,18 +1754,6 @@ if __name__ == "__main__":
...
@@ -269,18 +1754,6 @@ if __name__ == "__main__":
print
(
print
(
"Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
"Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
)
)
role
=
sys
.
argv
[
1
]
endpoints
=
sys
.
argv
[
2
]
update_args
()
trainer_id
=
int
(
sys
.
argv
[
3
])
runtime_main
(
DistTransformer2x2
)
current_endpoint
=
sys
.
argv
[
4
]
trainers
=
int
(
sys
.
argv
[
5
])
is_dist
=
True
if
sys
.
argv
[
6
]
==
"TRUE"
else
False
# FIXME(typhoonzero): refine this test.
is_async
=
True
if
sys
.
argv
[
7
]
==
"TRUE"
else
False
main
(
role
=
role
,
endpoints
=
endpoints
,
trainer_id
=
trainer_id
,
current_endpoint
=
current_endpoint
,
trainers
=
trainers
,
is_dist
=
is_dist
)
python/paddle/fluid/tests/unittests/dist_word2vec.py
浏览文件 @
b98b7440
...
@@ -49,28 +49,32 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
...
@@ -49,28 +49,32 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
dtype
=
'float32'
,
dtype
=
'float32'
,
is_sparse
=
IS_SPARSE
,
is_sparse
=
IS_SPARSE
,
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
()))
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
embed_second
=
fluid
.
layers
.
embedding
(
embed_second
=
fluid
.
layers
.
embedding
(
input
=
words
[
1
],
input
=
words
[
1
],
size
=
[
dict_size
,
EMBED_SIZE
],
size
=
[
dict_size
,
EMBED_SIZE
],
dtype
=
'float32'
,
dtype
=
'float32'
,
is_sparse
=
IS_SPARSE
,
is_sparse
=
IS_SPARSE
,
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
()))
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
embed_third
=
fluid
.
layers
.
embedding
(
embed_third
=
fluid
.
layers
.
embedding
(
input
=
words
[
2
],
input
=
words
[
2
],
size
=
[
dict_size
,
EMBED_SIZE
],
size
=
[
dict_size
,
EMBED_SIZE
],
dtype
=
'float32'
,
dtype
=
'float32'
,
is_sparse
=
IS_SPARSE
,
is_sparse
=
IS_SPARSE
,
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
()))
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
embed_forth
=
fluid
.
layers
.
embedding
(
embed_forth
=
fluid
.
layers
.
embedding
(
input
=
words
[
3
],
input
=
words
[
3
],
size
=
[
dict_size
,
EMBED_SIZE
],
size
=
[
dict_size
,
EMBED_SIZE
],
dtype
=
'float32'
,
dtype
=
'float32'
,
is_sparse
=
IS_SPARSE
,
is_sparse
=
IS_SPARSE
,
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
()))
name
=
'shared_w'
,
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
concat_embed
=
fluid
.
layers
.
concat
(
concat_embed
=
fluid
.
layers
.
concat
(
input
=
[
embed_first
,
embed_second
,
embed_third
,
embed_forth
],
input
=
[
embed_first
,
embed_second
,
embed_third
,
embed_forth
],
...
@@ -80,13 +84,13 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
...
@@ -80,13 +84,13 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
size
=
HIDDEN_SIZE
,
size
=
HIDDEN_SIZE
,
act
=
'sigmoid'
,
act
=
'sigmoid'
,
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
()))
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
predict_word
=
fluid
.
layers
.
fc
(
predict_word
=
fluid
.
layers
.
fc
(
input
=
hidden1
,
input
=
hidden1
,
size
=
dict_size
,
size
=
dict_size
,
act
=
'softmax'
,
act
=
'softmax'
,
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
()))
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
cost
=
fluid
.
layers
.
cross_entropy
(
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
predict_word
,
label
=
words
[
4
])
input
=
predict_word
,
label
=
words
[
4
])
avg_cost
=
fluid
.
layers
.
mean
(
cost
)
avg_cost
=
fluid
.
layers
.
mean
(
cost
)
...
...
python/paddle/fluid/tests/unittests/test_dist_train.py
浏览文件 @
b98b7440
...
@@ -100,7 +100,7 @@ class TestSendOp(unittest.TestCase):
...
@@ -100,7 +100,7 @@ class TestSendOp(unittest.TestCase):
main
.
global_block
().
append_op
(
main
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
type
=
"fetch_barrier"
,
inputs
=
{},
inputs
=
{},
outputs
=
{},
outputs
=
{
"Out"
:
[]
},
attrs
=
{
attrs
=
{
"endpoints"
:
[
"127.0.0.1:{0}"
.
format
(
port
)],
"endpoints"
:
[
"127.0.0.1:{0}"
.
format
(
port
)],
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
...
...
python/paddle/fluid/tests/unittests/test_dist_transformer.py
浏览文件 @
b98b7440
...
@@ -15,17 +15,55 @@
...
@@ -15,17 +15,55 @@
from
__future__
import
print_function
from
__future__
import
print_function
import
unittest
import
unittest
import
paddle
from
test_dist_base
import
TestDistBase
from
test_dist_base
import
TestDistBase
class
TestDistTransformer2x2
(
TestDistBase
):
def download_files():
    """
    Download the vocabulary, training and test files used by the
    distributed transformer test into the 'test_dist_transformer' module
    directory, passing each file's expected md5 to the downloader.
    """
    url_prefix = 'http://paddle-unittest-data.cdn.bcebos.com/dist_transformer/'

    # (remote file name, expected md5), fetched in this order.
    remote_files = (
        ('vocab.bpe.32000', 'a86d345ca6e27f6591d0dccb1b9be853'),
        ('train.tok.clean.bpe.32000.en-de',
         '033eb02b9449e6dd823f050782ac8914'),
        ('train.tok.clean.bpe.32000.en-de.train_0',
         'ddce7f602f352a0405267285379a38b1'),
        ('train.tok.clean.bpe.32000.en-de.train_1',
         '8757798200180285b1a619cd7f408747'),
        ('newstest2013.tok.bpe.32000.en-de',
         '9dd74a266dbdb25314183899f269b4a2'),
    )
    for file_name, md5sum in remote_files:
        paddle.dataset.common.download(url_prefix + file_name,
                                       'test_dist_transformer', md5sum)
class
TestDistTransformer2x2Sync
(
TestDistBase
):
def
_setup_config
(
self
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_sync_mode
=
True
def
test_transformer
(
self
):
def
test_transformer
(
self
):
# TODO(paddle-dev): check if the delta is OK.
download_files
()
# Usually start around ~8000 and converge to ~5000
#Note: loss on test dataset of the first 5 batch are:
self
.
check_with_place
(
"dist_transformer.py"
,
delta
=
400
)
# 10.518872, 10.518871, 10.518868, 10.518862, 10.518855
self
.
check_with_place
(
"dist_transformer.py"
,
delta
=
1e-7
)
class
TestDistTransformer2x2Async
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
False
def
test_transformer
(
self
):
download_files
()
self
.
check_with_place
(
"dist_transformer.py"
,
delta
=
1.0
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_dist_word2vec.py
浏览文件 @
b98b7440
...
@@ -22,7 +22,7 @@ class TestDistSeResneXt2x2(TestDistBase):
...
@@ -22,7 +22,7 @@ class TestDistSeResneXt2x2(TestDistBase):
self
.
_sync_mode
=
True
self
.
_sync_mode
=
True
def
test_se_resnext
(
self
):
def
test_se_resnext
(
self
):
self
.
check_with_place
(
"dist_word2vec.py"
,
delta
=
1e-
7
)
self
.
check_with_place
(
"dist_word2vec.py"
,
delta
=
1e-
4
)
class
TestDistSeResneXt2x2Async
(
TestDistBase
):
class
TestDistSeResneXt2x2Async
(
TestDistBase
):
...
...
python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
浏览文件 @
b98b7440
...
@@ -20,41 +20,50 @@ import math
...
@@ -20,41 +20,50 @@ import math
from
op_test
import
OpTest
from
op_test
import
OpTest
def
quantize_max_abs
(
x
,
num_bits
):
def
quantize_max_abs
(
x
,
max_range
):
range
=
math
.
pow
(
2
,
num_bits
)
-
1
scale
=
np
.
max
(
np
.
abs
(
x
).
flatten
())
scale
=
np
.
max
(
np
.
abs
(
x
).
flatten
())
y
=
np
.
round
(
x
/
scale
*
range
)
y
=
np
.
round
(
x
/
scale
*
max_
range
)
return
y
,
scale
return
y
,
scale
def
dequantize_max_abs
(
x
,
num_bits
,
scale
):
def
dequantize_max_abs
(
x
,
scale
,
max_range
):
range
=
math
.
pow
(
2
,
num_bits
)
-
1
y
=
(
scale
/
max_range
)
*
x
y
=
(
scale
/
range
)
*
x
return
y
return
y
class
TestFakeDequantizeMaxAbsOp
(
OpTest
):
class
TestFakeDequantizeMaxAbsOp
(
OpTest
):
def
set_args
(
self
):
def
set_args
(
self
):
self
.
num_bits
=
8
self
.
num_bits
=
8
self
.
max_range
=
math
.
pow
(
2
,
self
.
num_bits
-
1
)
-
1
self
.
data_type
=
"float32"
def
setUp
(
self
):
def
setUp
(
self
):
self
.
set_args
()
self
.
set_args
()
self
.
op_type
=
"fake_dequantize_max_abs"
self
.
op_type
=
"fake_dequantize_max_abs"
x
=
np
.
random
.
randn
(
31
,
65
).
astype
(
"float32"
)
x
=
np
.
random
.
randn
(
31
,
65
).
astype
(
self
.
data_type
)
yq
,
scale
=
quantize_max_abs
(
x
,
self
.
num_bits
)
yq
,
scale
=
quantize_max_abs
(
x
,
self
.
max_range
)
ydq
=
dequantize_max_abs
(
yq
,
s
elf
.
num_bits
,
scal
e
)
ydq
=
dequantize_max_abs
(
yq
,
s
cale
,
self
.
max_rang
e
)
self
.
inputs
=
{
'X'
:
yq
}
self
.
inputs
=
{
'X'
:
yq
,
'Scale'
:
np
.
array
(
scale
).
astype
(
self
.
data_type
)
}
self
.
attrs
=
{
'
num_bits'
:
self
.
num_bits
,
'scale'
:
float
(
scale
)
}
self
.
attrs
=
{
'
max_range'
:
self
.
max_range
}
self
.
outputs
=
{
'Out'
:
ydq
}
self
.
outputs
=
{
'Out'
:
ydq
}
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
()
self
.
check_output
()
class
TestFakeDequantizeMaxAbsOp5Bits
(
OpTest
):
class
TestFakeDequantizeMaxAbsOpDouble
(
TestFakeDequantizeMaxAbsOp
):
def
set_args
(
self
):
self
.
num_bits
=
8
self
.
max_range
=
math
.
pow
(
2
,
self
.
num_bits
-
1
)
-
1
self
.
data_type
=
"float64"
class
TestFakeDequantizeMaxAbsOp5Bits
(
TestFakeDequantizeMaxAbsOp
):
def
set_args
(
self
):
def
set_args
(
self
):
self
.
num_bits
=
5
self
.
num_bits
=
5
self
.
max_range
=
math
.
pow
(
2
,
self
.
num_bits
-
1
)
-
1
self
.
data_type
=
"float32"
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
0 → 100644
浏览文件 @
b98b7440
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
import
math
from
op_test
import
OpTest
from
test_gru_op
import
gru
from
test_fusion_lstm_op
import
fc
,
ACTIVATION
def fusion_gru(
        x,  # T x M
        lod,  # 1 x N
        h0,  # N x D
        wx,  # M x 3D
        wh,  # D x 3D
        bias,  # 1 x 3D
        is_reverse,
        act_state,
        act_gate):
    """
    Reference implementation of the fused op: apply fc(x, wx, bias), then
    feed the projection to the reference gru with an all-zero GRU bias.

    Returns whatever gru returns.
    """
    projected = fc(x, wx, bias)
    # The bias was already added by fc, so the GRU itself gets zeros.
    zero_bias = np.zeros((1, wh.shape[1]), dtype='float64')
    return gru(projected, lod, h0, wh, zero_bias, is_reverse, act_state,
               act_gate)
class TestFusionGRUOp(OpTest):
    """
    Checks the "fusion_gru" op against the Python reference fusion_gru.

    Subclasses change the configuration by overriding set_confs.
    """

    def set_confs(self):
        """Hook called by setUp after the defaults have been assigned."""
        pass

    def setUp(self):
        # Default configuration; set_confs may override any of it.
        self.op_type = "fusion_gru"
        self.lod = [[2, 4, 3]]
        self.M = 3
        self.D = 5
        self.is_reverse = False
        self.with_h0 = True
        self.with_bias = True
        self.act_state = 'tanh'
        self.act_gate = 'sigmoid'
        self.set_confs()

        seq_lens = self.lod[0]
        total_steps = sum(seq_lens)
        batch = len(seq_lens)
        gate_width = 3 * self.D

        x = np.random.rand(total_steps, self.M).astype('float64')
        wx = np.random.rand(self.M, gate_width).astype('float64')
        wh = np.random.rand(self.D, gate_width).astype('float64')

        # Zeros stand in for an omitted bias / initial state in the reference.
        if self.with_bias:
            bias = np.random.rand(1, gate_width).astype('float64')
        else:
            bias = np.zeros((1, gate_width), dtype='float64')

        if self.with_h0:
            h0 = np.random.rand(batch, self.D).astype('float64')
        else:
            h0 = np.zeros((batch, self.D), dtype='float64')

        state_act = ACTIVATION[self.act_state]
        gate_act = ACTIVATION[self.act_gate]
        _, _, _, hidden = fusion_gru(x, self.lod, h0, wx, wh, bias,
                                     self.is_reverse, state_act, gate_act)

        self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh}
        # Optional inputs are only passed to the op when enabled.
        if self.with_bias:
            self.inputs['Bias'] = bias
        if self.with_h0:
            self.inputs['H0'] = h0

        self.outputs = {'Hidden': (hidden, self.lod)}
        self.attrs = {
            'activation': self.act_state,
            'gate_activation': self.act_gate,
            'is_reverse': self.is_reverse
        }

    def test_check_output(self):
        self.check_output(atol=1e-8)
class TestFusionGRUOpNoInitial(TestFusionGRUOp):
    """Same check with with_h0 disabled, so no 'H0' input is passed."""

    def set_confs(self):
        self.with_h0 = False
class TestFusionGRUOpNoBias(TestFusionGRUOp):
    """Same check with with_bias disabled, so no 'Bias' input is passed."""

    def set_confs(self):
        self.with_bias = False
class TestFusionGRUOpReverse(TestFusionGRUOp):
    """Same check with the is_reverse attribute set to True."""

    def set_confs(self):
        self.is_reverse = True
class TestFusionGRUOpMD1(TestFusionGRUOp):
    """Same check with M = 36 and D = 8."""

    def set_confs(self):
        self.M, self.D = 36, 8
class TestFusionGRUOpMD2(TestFusionGRUOp):
    """Same check with M = 8 and D = 8."""

    def set_confs(self):
        self.M, self.D = 8, 8
class TestFusionGRUOpBS1(TestFusionGRUOp):
    """Same check with a single sequence of length 3 and D = 16."""

    def set_confs(self):
        self.lod = [[3]]
        self.D = 16
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
浏览文件 @
b98b7440
...
@@ -43,13 +43,13 @@ def fusion_lstm(
...
@@ -43,13 +43,13 @@ def fusion_lstm(
act_cell
,
act_cand
)
act_cell
,
act_cand
)
class
Test
Lstm
Op
(
OpTest
):
class
Test
FusionLSTM
Op
(
OpTest
):
def
set_
argument
(
self
):
def
set_
conf
(
self
):
self
.
lod
=
[[
2
,
3
,
2
]]
pass
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
'fusion_lstm'
self
.
op_type
=
'fusion_lstm'
self
.
lod
=
[[
2
,
3
,
2
]]
self
.
lod
=
[[
2
,
3
,
5
,
4
]]
self
.
M
=
8
self
.
M
=
8
self
.
D
=
16
self
.
D
=
16
self
.
has_initial_state
=
False
self
.
has_initial_state
=
False
...
@@ -58,33 +58,33 @@ class TestLstmOp(OpTest):
...
@@ -58,33 +58,33 @@ class TestLstmOp(OpTest):
self
.
act_cell
=
'tanh'
self
.
act_cell
=
'tanh'
self
.
act_cand
=
'tanh'
self
.
act_cand
=
'tanh'
self
.
use_peepholes
=
False
self
.
use_peepholes
=
False
self
.
set_
argument
()
self
.
set_
conf
()
T
=
sum
(
self
.
lod
[
0
])
T
=
sum
(
self
.
lod
[
0
])
bs
=
len
(
self
.
lod
[
0
])
bs
=
len
(
self
.
lod
[
0
])
x
=
np
.
random
.
normal
(
size
=
(
T
,
self
.
M
)).
astype
(
'float
64
'
)
x
=
np
.
random
.
normal
(
size
=
(
T
,
self
.
M
)).
astype
(
'float
32
'
)
if
self
.
has_initial_state
:
if
self
.
has_initial_state
:
h0
=
np
.
random
.
normal
(
size
=
(
bs
,
self
.
D
)).
astype
(
'float
64
'
)
h0
=
np
.
random
.
normal
(
size
=
(
bs
,
self
.
D
)).
astype
(
'float
32
'
)
c0
=
np
.
random
.
normal
(
size
=
(
bs
,
self
.
D
)).
astype
(
'float
64
'
)
c0
=
np
.
random
.
normal
(
size
=
(
bs
,
self
.
D
)).
astype
(
'float
32
'
)
else
:
else
:
h0
=
np
.
zeros
((
bs
,
self
.
D
)).
astype
(
'float
64
'
)
h0
=
np
.
zeros
((
bs
,
self
.
D
)).
astype
(
'float
32
'
)
c0
=
np
.
zeros
((
bs
,
self
.
D
)).
astype
(
'float
64
'
)
c0
=
np
.
zeros
((
bs
,
self
.
D
)).
astype
(
'float
32
'
)
wh
=
np
.
random
.
normal
(
size
=
(
self
.
D
,
4
*
self
.
D
)).
astype
(
'float
64
'
)
wh
=
np
.
random
.
normal
(
size
=
(
self
.
D
,
4
*
self
.
D
)).
astype
(
'float
32
'
)
if
self
.
use_peepholes
:
if
self
.
use_peepholes
:
b
=
np
.
random
.
normal
(
size
=
(
1
,
7
*
self
.
D
)).
astype
(
'float
64
'
)
b
=
np
.
random
.
normal
(
size
=
(
1
,
7
*
self
.
D
)).
astype
(
'float
32
'
)
else
:
else
:
b
=
np
.
random
.
normal
(
size
=
(
1
,
4
*
self
.
D
)).
astype
(
'float
64
'
)
b
=
np
.
random
.
normal
(
size
=
(
1
,
4
*
self
.
D
)).
astype
(
'float
32
'
)
w_b
=
np
.
copy
(
b
[:,
0
:
4
*
self
.
D
])
w_b
=
np
.
copy
(
b
[:,
0
:
4
*
self
.
D
])
w_c
=
b
[:,
4
*
self
.
D
:]
if
self
.
use_peepholes
else
None
w_c
=
b
[:,
4
*
self
.
D
:]
if
self
.
use_peepholes
else
None
# this is the weight of fc
# this is the weight of fc
wx
=
np
.
random
.
normal
(
size
=
(
self
.
M
,
4
*
self
.
D
)).
astype
(
'float
64
'
)
wx
=
np
.
random
.
normal
(
size
=
(
self
.
M
,
4
*
self
.
D
)).
astype
(
'float
32
'
)
# this is the bias of fc
# this is the bias of fc
# and it should be manually added into the bias of this fusion LSTM
# and it should be manually added into the bias of this fusion LSTM
bx
=
np
.
random
.
normal
(
size
=
(
1
,
4
*
self
.
D
)).
astype
(
'float
64
'
)
bx
=
np
.
random
.
normal
(
size
=
(
1
,
4
*
self
.
D
)).
astype
(
'float
32
'
)
b
[
0
,
0
:
4
*
self
.
D
]
+=
bx
[
0
,
:]
b
[
0
,
0
:
4
*
self
.
D
]
+=
bx
[
0
,
:]
h
,
c
=
fusion_lstm
(
x
,
self
.
lod
,
wx
,
bx
,
h0
,
c0
,
wh
,
w_b
,
w_c
,
h
,
c
=
fusion_lstm
(
x
,
self
.
lod
,
wx
,
bx
,
h0
,
c0
,
wh
,
w_b
,
w_c
,
self
.
is_reverse
,
ACTIVATION
[
self
.
act_gate
],
self
.
is_reverse
,
ACTIVATION
[
self
.
act_gate
],
...
@@ -114,35 +114,45 @@ class TestLstmOp(OpTest):
...
@@ -114,35 +114,45 @@ class TestLstmOp(OpTest):
}
}
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
(
atol
=
1e-8
)
self
.
check_output
()
class
TestLstmOpInitReverse
(
TestLstmOp
):
class
TestFusionLSTMOpInit
(
TestFusionLSTMOp
):
def
set_argument
(
self
):
def
set_conf
(
self
):
self
.
has_initial_state
=
True
class
TestFusionLSTMOpReverse
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
is_reverse
=
True
class
TestFusionLSTMOpInitReverse
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
has_initial_state
=
True
self
.
has_initial_state
=
True
self
.
is_reverse
=
True
self
.
is_reverse
=
True
class
Test
LstmOpMD1
(
TestLstm
Op
):
class
Test
FusionLSTMOpMD1
(
TestFusionLSTM
Op
):
def
set_
argument
(
self
):
def
set_
conf
(
self
):
self
.
M
=
36
self
.
M
=
36
self
.
D
=
8
self
.
D
=
8
class
Test
LstmOpMD2
(
TestLstm
Op
):
class
Test
FusionLSTMOpMD2
(
TestFusionLSTM
Op
):
def
set_
argument
(
self
):
def
set_
conf
(
self
):
self
.
M
=
8
self
.
M
=
8
self
.
D
=
8
self
.
D
=
8
class
Test
LstmOpMD3
(
TestLstm
Op
):
class
Test
FusionLSTMOpMD3
(
TestFusionLSTM
Op
):
def
set_
argument
(
self
):
def
set_
conf
(
self
):
self
.
M
=
15
self
.
M
=
15
self
.
D
=
3
self
.
D
=
3
class
Test
LstmOpBS1
(
TestLstm
Op
):
class
Test
FusionLSTMOpBS1
(
TestFusionLSTM
Op
):
def
set_
argument
(
self
):
def
set_
conf
(
self
):
self
.
lod
=
[[
3
]]
self
.
lod
=
[[
3
]]
self
.
D
=
16
self
.
D
=
16
...
...
python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py
0 → 100644
浏览文件 @
b98b7440
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
from
test_fusion_lstm_op
import
fc
,
ACTIVATION
def fusion_seqexpand_concat_fc(xs, lod, w, b, fc_act):
    """
    Reference implementation: repeat the rows of every input after the first
    according to lod[0], concatenate all inputs along axis 1, apply fc with
    (w, b) and then the activation fc_act.

    Shape expectations are enforced with asserts; returns the activated
    fc output.
    """
    seq_lens = lod[0]
    total_rows = sum(seq_lens)
    batch = len(seq_lens)
    num_inputs = len(xs)
    out_width = w.shape[1]

    # The first input is used as-is; the remaining ones are expanded.
    pieces = [xs[0]]
    for x in xs[1:]:
        assert x.shape[0] == batch
        repeated = np.repeat(x, seq_lens, axis=0)
        assert repeated.shape[0] == total_rows
        assert repeated.shape[1] == x.shape[1]
        pieces.append(repeated)

    fc_input = np.concatenate(pieces, axis=1)
    assert fc_input.shape[0] == total_rows
    assert fc_input.shape[1] == w.shape[0]

    fc_out = fc_act(fc(fc_input, w, b))
    assert fc_out.shape[0] == total_rows
    assert fc_out.shape[1] == out_width
    return fc_out
class TestFusionSeqExpandConcatFCOp(OpTest):
    """
    Checks the 'fusion_seqexpand_concat_fc' op against the Python reference
    fusion_seqexpand_concat_fc.

    Subclasses change the configuration by overriding set_conf.
    """

    def set_conf(self):
        """Hook called by setUp after the defaults have been assigned."""
        pass

    def setUp(self):
        # Default configuration; set_conf may override any of it.
        self.op_type = 'fusion_seqexpand_concat_fc'
        self.lod = [[3, 5, 8, 2]]
        self.inputs_M = [15, 10, 10]
        self.D = 20
        self.with_bias = True
        self.fc_act = 'relu'
        self.set_conf()

        seq_lens = self.lod[0]
        total_rows = sum(seq_lens)
        bs = len(seq_lens)
        extra_inputs = len(self.inputs_M) - 1

        # The first input has one row per time step, the others one per sequence.
        x0 = np.random.normal(
            size=(total_rows, self.inputs_M[0])).astype('float32')
        xs = [x0]
        for k in range(extra_inputs):
            width = self.inputs_M[k + 1]
            xs.append(np.random.normal(size=(bs, width)).astype('float32'))

        # fc weight and bias
        w = np.random.normal(
            size=(sum(self.inputs_M), self.D)).astype('float32')
        if self.with_bias:
            b = np.random.normal(size=(1, self.D)).astype('float32')
        else:
            b = np.zeros((1, self.D)).astype('float32')

        out = fusion_seqexpand_concat_fc(xs, self.lod, w, b,
                                         ACTIVATION[self.fc_act])

        self.inputs = {'X': [('x0', (x0, self.lod))], 'FCWeight': w}
        # Every extra input is passed with a lod of bs sequences of length 1.
        normal_lod = [[1] * bs]
        for k in range(extra_inputs):
            entry = ('x%d' % (k + 1), (xs[k + 1], normal_lod))
            self.inputs['X'].append(entry)

        # The bias is only passed to the op when enabled.
        if self.with_bias:
            self.inputs['FCBias'] = b

        self.outputs = {'Out': (out, self.lod)}
        self.attrs = {'fc_activation': self.fc_act}

    def test_check_output(self):
        self.check_output()
class TestFusionSECFCOpNonBias(TestFusionSeqExpandConcatFCOp):
    """Same check with with_bias disabled, so no 'FCBias' input is passed."""

    def set_conf(self):
        self.with_bias = False
class TestFusionSECFCOpNonAct(TestFusionSeqExpandConcatFCOp):
    """Same check with fc_act set to 'identity'."""

    def set_conf(self):
        self.fc_act = 'identity'
class TestFusionSECFCOpMD1(TestFusionSeqExpandConcatFCOp):
    """Same check with five inputs of widths 3, 4, 2, 1, 5 and D = 8."""

    def set_conf(self):
        self.inputs_M = [3, 4, 2, 1, 5]
        self.D = 8
class TestFusionSECFCOpMD2(TestFusionSeqExpandConcatFCOp):
    """Same check with sequence lengths 5 and 6 and two inputs of width 1."""

    def set_conf(self):
        self.lod = [[5, 6]]
        self.inputs_M = [1, 1]
class TestFusionSECFCOpBS1_1(TestFusionSeqExpandConcatFCOp):
    """Same check with one sequence of length 1 and input widths 3, 4, 2."""

    def set_conf(self):
        self.lod = [[1]]
        self.inputs_M = [3, 4, 2]
class TestFusionSECFCOpBS1_2(TestFusionSeqExpandConcatFCOp):
    """A single sequence of length 1 with two inputs."""

    def set_conf(self):
        self.inputs_M = [3, 4]
        self.lod = [[1]]
class TestFusionSECFCOpBS1_3(TestFusionSeqExpandConcatFCOp):
    """A single sequence of length 5 with two inputs."""

    def set_conf(self):
        self.inputs_M = [6, 3]
        self.lod = [[5]]
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_generate_proposals.py
0 → 100644
浏览文件 @
b98b7440
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
numpy
as
np
import
sys
import
math
import
paddle.fluid
as
fluid
from
op_test
import
OpTest
from
test_multiclass_nms_op
import
nms
from
test_anchor_generator_op
import
anchor_generator_in_python
import
copy
def generate_proposals_in_python(scores, bbox_deltas, im_info, anchors,
                                 variances, pre_nms_topN, post_nms_topN,
                                 nms_thresh, min_size, eta):
    """Python reference for proposal generation over a batch of images.

    Each image (first axis of `scores`, `bbox_deltas` and `im_info`) is
    processed independently by `proposal_for_one_image`.

    Args:
        scores: array indexed as [image, :, :, :].
        bbox_deltas: array indexed as [image, :, :, :].
        im_info: array indexed as [image, :].
        anchors: array reshaped here to rows of 4 values.
        variances: passed through unchanged to `proposal_for_one_image`.
        pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta: forwarded
            unchanged to `proposal_for_one_image`.

    Returns:
        A tuple (rpn_rois, rpn_roi_probs, lod): two lists with one array per
        image, and a list holding the number of proposals kept per image.
    """
    all_anchors = anchors.reshape(-1, 4)

    rpn_rois = []
    rpn_roi_probs = []
    lod = []

    num_images = scores.shape[0]
    for img_idx in range(num_images):
        img_i_boxes, img_i_probs = proposal_for_one_image(
            im_info[img_idx, :], all_anchors, variances,
            bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :],
            pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta)
        # The number of score rows is the number of proposals for this image.
        lod.append(img_i_probs.shape[0])
        rpn_rois.append(img_i_boxes)
        rpn_roi_probs.append(img_i_probs)

    return rpn_rois, rpn_roi_probs, lod
def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas,
                           scores, pre_nms_topN, post_nms_topN, nms_thresh,
                           min_size, eta):
    """Generate proposals for a single image.

    Args:
        im_info: 1-D array; the first two entries are used as the image
            shape for clipping and the third as the scale for `min_size`.
        all_anchors: anchors, reshaped here to (-1, 4).
        variances: per-anchor variances, reshaped here to (-1, 4).
        bbox_deltas: box regression output, laid out as (4 * A, H, W).
        scores: objectness scores, laid out as (A, H, W).
        pre_nms_topN: number of top-scoring candidates kept before NMS; a
            value <= 0 or >= the number of candidates keeps all of them.
        post_nms_topN: maximum number of proposals kept after NMS; a value
            <= 0 keeps all of them.
        nms_thresh: overlap threshold for NMS; NMS is skipped when <= 0.
        min_size: minimum box side length, forwarded to `filter_boxes`.
        eta: adaptive-threshold factor forwarded to `nms`.

    Returns:
        A tuple (proposals, scores) of the surviving boxes and their scores.
    """
    # Transpose and reshape predicted bbox transformations to get them
    # into the same order as the anchors:
    #   - bbox deltas will be (4 * A, H, W) format from conv output
    #   - transpose to (H, W, 4 * A)
    #   - reshape to (H * W * A, 4) where rows are ordered by (H, W, A)
    #     in slowest to fastest order to match the enumerated anchors
    bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape(-1, 4)
    all_anchors = all_anchors.reshape(-1, 4)
    variances = variances.reshape(-1, 4)

    # Same story for the scores:
    #   - scores are (A, H, W) format from conv output
    #   - transpose to (H, W, A)
    #   - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
    #     to match the order of anchors and bbox_deltas
    scores = scores.transpose((1, 2, 0)).reshape(-1, 1)

    # sort all (proposal, score) pairs by score from highest to lowest
    # take top pre_nms_topN (e.g. 6000)
    if pre_nms_topN <= 0 or pre_nms_topN >= len(scores):
        order = np.argsort(-scores.squeeze())
    else:
        # Avoid sorting possibly large arrays;
        # First partition to get top K unsorted
        # and then sort just those
        inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN]
        order = np.argsort(-scores[inds].squeeze())
        order = inds[order]

    scores = scores[order, :]
    bbox_deltas = bbox_deltas[order, :]
    all_anchors = all_anchors[order, :]
    # box_coder pairs row i of the variances with row i of the anchors and
    # deltas, so the variances must follow the same reordering.
    variances = variances[order, :]

    proposals = box_coder(all_anchors, bbox_deltas, variances)

    # clip proposals to image (may result in proposals with zero area
    # that will be removed in the next step)
    proposals = clip_tiled_boxes(proposals, im_info[:2])

    # remove predicted boxes with height or width < min_size
    keep = filter_boxes(proposals, min_size, im_info)
    proposals = proposals[keep, :]
    scores = scores[keep, :]

    # apply loose nms (e.g. threshold = 0.7)
    # take post_nms_topN (e.g. 1000)
    # return the top proposals
    if nms_thresh > 0:
        keep = nms(
            boxes=proposals,
            scores=scores,
            nms_threshold=nms_thresh,
            eta=eta)
        if post_nms_topN > 0 and post_nms_topN < len(keep):
            keep = keep[:post_nms_topN]
        proposals = proposals[keep, :]
        scores = scores[keep, :]

    return proposals, scores
def box_coder(all_anchors, bbox_deltas, variances):
    """Decode RPN box deltas against their anchors.

    Args:
        all_anchors: array whose rows are (xmin, ymin, xmax, ymax) anchors.
        bbox_deltas: array of per-anchor deltas, one row per anchor.
        variances: per-anchor scaling factors with the same row layout as
            `bbox_deltas`, or None to use the deltas unscaled.

    Returns:
        A float32 array shaped like `bbox_deltas` whose rows are
        (xmin, ymin, xmax, ymax) proposals.
    """
    scale_by_variance = variances is not None

    # Anchor geometry per row: width, height, center_x, center_y.
    geometry = np.zeros_like(bbox_deltas, dtype=np.float32)
    geometry[:, 0] = all_anchors[:, 2] - all_anchors[:, 0]
    geometry[:, 1] = all_anchors[:, 3] - all_anchors[:, 1]
    geometry[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2
    geometry[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2

    # Decoded box per row: center_x, center_y, width, height.
    decoded = np.zeros_like(bbox_deltas, dtype=np.float32)
    for row in range(bbox_deltas.shape[0]):
        if scale_by_variance:
            shift_x = variances[row, 0] * bbox_deltas[row, 0]
            shift_y = variances[row, 1] * bbox_deltas[row, 1]
            log_w = variances[row, 2] * bbox_deltas[row, 2]
            log_h = variances[row, 3] * bbox_deltas[row, 3]
        else:
            shift_x = bbox_deltas[row, 0]
            shift_y = bbox_deltas[row, 1]
            log_w = bbox_deltas[row, 2]
            log_h = bbox_deltas[row, 3]
        # Centers move in units of the anchor size; sizes scale exponentially.
        decoded[row, 0] = shift_x * geometry[row, 0] + geometry[row, 2]
        decoded[row, 1] = shift_y * geometry[row, 1] + geometry[row, 3]
        decoded[row, 2] = math.exp(log_w) * geometry[row, 0]
        decoded[row, 3] = math.exp(log_h) * geometry[row, 1]

    # Convert center/size form into corner form.
    half_w = decoded[:, 2] / 2
    half_h = decoded[:, 3] / 2
    proposals = np.zeros_like(bbox_deltas, dtype=np.float32)
    proposals[:, 0] = decoded[:, 0] - half_w
    proposals[:, 1] = decoded[:, 1] - half_h
    proposals[:, 2] = decoded[:, 0] + half_w
    proposals[:, 3] = decoded[:, 1] + half_h

    return proposals
def clip_tiled_boxes(boxes, im_shape):
    """Clamp box coordinates to the image, in place.

    `boxes` has shape (N, 4 * num_tiled_boxes), with each group of four
    columns holding (x1, y1, x2, y2). `im_shape` is [height, width].
    x-coordinates are clamped to [0, width - 1] and y-coordinates to
    [0, height - 1]. The same `boxes` array is modified and returned.
    """
    num_cols = boxes.shape[1]
    assert num_cols % 4 == 0, \
        'boxes.shape[1] is {:d}, but must be divisible by 4.'.format(num_cols)

    height, width = im_shape[0], im_shape[1]
    # Column offsets 0 and 2 are x-coordinates, 1 and 3 are y-coordinates.
    for offset, bound in ((0, width), (1, height), (2, width), (3, height)):
        column = boxes[:, offset::4]
        boxes[:, offset::4] = np.maximum(np.minimum(column, bound - 1), 0)
    return boxes
def filter_boxes(boxes, min_size, im_info):
    """Return the row indices of boxes that are large enough and centered
    inside the image.

    A box is kept when both its width and height (computed with a +1 pixel
    convention) are at least `min_size * im_info[2]`, its center x is below
    `im_info[1]` and its center y is below `im_info[0]`.
    """
    # Scale min_size to match image scale
    min_size *= im_info[2]

    widths = boxes[:, 2] - boxes[:, 0] + 1
    heights = boxes[:, 3] - boxes[:, 1] + 1
    center_x = boxes[:, 0] + widths / 2.
    center_y = boxes[:, 1] + heights / 2.

    large_enough = (widths >= min_size) & (heights >= min_size)
    inside_image = (center_x < im_info[1]) & (center_y < im_info[0])
    return np.where(large_enough & inside_image)[0]
def iou(box_a, box_b):
    """Compute the overlap ratio (intersection over union) of two boxes.

    Each box is indexable as (x, y, x, y); the two corners may be given in
    either order. Box areas use a +1 pixel convention while the
    intersection does not, so two identical boxes do not score exactly 1.
    """

    def _normalized(box):
        # Order the corners as (xmin, ymin, xmax, ymax).
        return (min(box[0], box[2]), min(box[1], box[3]),
                max(box[0], box[2]), max(box[1], box[3]))

    left_a, top_a, right_a, bottom_a = _normalized(box_a)
    left_b, top_b, right_b, bottom_b = _normalized(box_b)

    area_a = (bottom_a - top_a + 1) * (right_a - left_a + 1)
    area_b = (bottom_b - top_b + 1) * (right_b - left_b + 1)
    if area_a <= 0 and area_b <= 0:
        return 0.0

    # Clamp to zero so disjoint boxes contribute no intersection.
    overlap_w = max(min(right_a, right_b) - max(left_a, left_b), 0.0)
    overlap_h = max(min(bottom_a, bottom_b) - max(top_a, top_b), 0.0)
    inter_area = overlap_w * overlap_h

    return inter_area / (area_a + area_b - inter_area)
def nms(boxes, scores, nms_threshold, eta=1.0):
    """Greedy non-maximum suppression.

    Boxes are visited from highest to lowest score. A box is kept when its
    overlap (as computed by `iou`) with every previously kept box is at
    most the current threshold.

    Args:
        boxes: indexable collection of boxes, one per score.
        scores: array of scores; it is flattened before use.
        nms_threshold: (float) initial overlap threshold.
        eta: (float) adaptive-NMS factor. When eta < 1, the threshold is
            multiplied by eta after each kept box while it is above 0.5.
    Return:
        List of indices of the kept boxes, in order of decreasing score.
    """
    flat_scores = copy.deepcopy(scores).flatten()
    # mergesort is stable, so equal scores keep their original order.
    ranking = np.argsort(-flat_scores, axis=0, kind='mergesort')

    threshold = nms_threshold
    selected_indices = []
    for candidate in ranking:
        suppressed = False
        for chosen in selected_indices:
            overlap = iou(boxes[candidate], boxes[chosen])
            if not (overlap <= threshold):
                suppressed = True
                break
        if suppressed:
            continue
        selected_indices.append(candidate)
        if eta < 1 and threshold > 0.5:
            threshold *= eta
    return selected_indices
class TestGenerateProposalsOp(OpTest):
    """Output check for the 'generate_proposals' operator.

    The expected values come from `generate_proposals_in_python`, run on
    random scores and box deltas for a single image.
    """

    def set_data(self):
        """Build the inputs, attributes and expected outputs of the op."""
        self.init_test_params()
        self.init_test_input()
        self.init_test_output()
        self.inputs = {
            'Scores': self.scores,
            'BboxDeltas': self.bbox_deltas,
            'ImInfo': self.im_info.astype(np.float32),
            'Anchors': self.anchors,
            'Variances': self.variances
        }

        self.attrs = {
            'pre_nms_topN': self.pre_nms_topN,
            'post_nms_topN': self.post_nms_topN,
            'nms_thresh': self.nms_thresh,
            'min_size': self.min_size,
            'eta': self.eta
        }

        # Only the first image's results are compared; init_test_input uses
        # a batch size of 1.
        self.outputs = {
            'RpnRois': (self.rpn_rois[0], [self.lod]),
            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod])
        }

    def test_check_output(self):
        self.check_output()

    def setUp(self):
        self.op_type = "generate_proposals"
        self.set_data()

    def init_test_params(self):
        """Set the operator attributes used by the test."""
        self.pre_nms_topN = 12000  # train 12000, test 2000
        self.post_nms_topN = 5000  # train 6000, test 1000
        self.nms_thresh = 0.7
        self.min_size = 3.0
        self.eta = 0.8

    def init_test_input(self):
        """Create anchors, variances, image info and random predictions."""
        batch_size = 1
        input_channels = 20
        layer_h = 16
        layer_w = 16
        input_feat = np.random.random(
            (batch_size, input_channels, layer_h, layer_w)).astype('float32')
        self.anchors, self.variances = anchor_generator_in_python(
            input_feat=input_feat,
            anchor_sizes=[16., 32.],
            aspect_ratios=[0.5, 1.0],
            variances=[1.0, 1.0, 1.0, 1.0],
            stride=[16.0, 16.0],
            offset=0.5)
        self.im_info = np.array([[64., 64., 8.]])  # im_height, im_width, scale
        num_anchors = self.anchors.shape[2]
        self.scores = np.random.random(
            (batch_size, num_anchors, layer_h, layer_w)).astype('float32')
        self.bbox_deltas = np.random.random(
            (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32')

    def init_test_output(self):
        """Compute the expected proposals with the Python reference."""
        self.rpn_rois, self.rpn_roi_probs, self.lod = \
            generate_proposals_in_python(
                self.scores, self.bbox_deltas, self.im_info, self.anchors,
                self.variances, self.pre_nms_topN, self.post_nms_topN,
                self.nms_thresh, self.min_size, self.eta)
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_gru_op.py
浏览文件 @
b98b7440
...
@@ -19,22 +19,19 @@ import numpy as np
...
@@ -19,22 +19,19 @@ import numpy as np
import
math
import
math
import
functools
import
functools
from
op_test
import
OpTest
from
op_test
import
OpTest
from
test_lstm_op
import
identity
,
sigmoid
,
tanh
,
relu
from
test_lstm_op
import
ACTIVATION
class
TestGRUOp
(
OpTest
):
def
gru
(
lod
=
[[
2
,
4
,
3
]]
input
,
# T x 3D
batch_size
=
sum
(
lod
[
0
])
lod
,
# 1 x N
frame_size
=
5
h0
,
# N x D
activate
=
{
weight
,
# D x 3D
'identity'
:
identity
,
bias
,
# 1 x 3D
'sigmoid'
:
sigmoid
,
is_reverse
,
'tanh'
:
tanh
,
act_state
,
'relu'
:
relu
act_gate
):
}
def
_seq_to_batch
(
lod
,
is_reverse
):
@
staticmethod
def
seq_to_batch
(
lod
,
is_reverse
):
idx_in_seq_list
=
[]
idx_in_seq_list
=
[]
seq_lens
=
lod
[
0
]
seq_lens
=
lod
[
0
]
seq_starts
=
[
0
]
seq_starts
=
[
0
]
...
@@ -56,44 +53,38 @@ class TestGRUOp(OpTest):
...
@@ -56,44 +53,38 @@ class TestGRUOp(OpTest):
idx_in_seq_list
.
append
(
idx_in_seq
)
idx_in_seq_list
.
append
(
idx_in_seq
)
return
idx_in_seq_list
,
sorted_seqs
return
idx_in_seq_list
,
sorted_seqs
def
gru_step
(
self
,
x
,
h_p
,
w
,
b
):
def
_step
(
x
,
h_p
,
w
,
b
,
act_state
,
act_gate
):
batch_size
=
x
.
shape
[
0
]
T
=
x
.
shape
[
0
]
frame_size
=
w
.
shape
[
0
]
D
=
w
.
shape
[
0
]
g
=
x
+
np
.
tile
(
b
,
(
batch_size
,
1
))
g
=
x
+
np
.
tile
(
b
,
(
T
,
1
))
w_u_r
=
w
.
flatten
()[:
frame_size
*
frame_size
*
2
].
reshape
(
w_u_r
=
w
.
flatten
()[:
D
*
D
*
2
].
reshape
((
D
,
D
*
2
))
(
frame_size
,
frame_size
*
2
))
u_r
=
act_gate
(
np
.
dot
(
h_p
,
w_u_r
)
+
g
[:,
:
D
*
2
])
u_r
=
self
.
activate
[
self
.
attrs
[
'gate_activation'
]](
np
.
dot
(
u
=
u_r
[:,
:
D
]
h_p
,
w_u_r
)
+
g
[:,
:
frame_size
*
2
])
r
=
u_r
[:,
D
:
D
*
2
]
u
=
u_r
[:,
:
frame_size
]
r
=
u_r
[:,
frame_size
:
frame_size
*
2
]
r_h_p
=
r
*
h_p
r_h_p
=
r
*
h_p
w_c
=
w
.
flatten
()[
frame_size
*
frame_size
*
2
:].
reshape
(
w_c
=
w
.
flatten
()[
D
*
D
*
2
:].
reshape
((
D
,
D
))
(
frame_size
,
frame_size
))
c
=
act_state
(
np
.
dot
(
r_h_p
,
w_c
)
+
g
[:,
D
*
2
:])
c
=
self
.
activate
[
self
.
attrs
[
'activation'
]](
np
.
dot
(
r_h_p
,
w_c
)
+
g
[:,
frame_size
*
2
:])
g
=
np
.
hstack
((
u_r
,
c
))
g
=
np
.
hstack
((
u_r
,
c
))
h
=
u
*
c
+
(
1
-
u
)
*
h_p
h
=
u
*
c
+
(
1
-
u
)
*
h_p
return
g
,
r_h_p
,
h
return
g
,
r_h_p
,
h
def
gru
(
self
):
T
=
sum
(
lod
[
0
])
input
,
lod
=
self
.
inputs
[
'Input'
]
N
=
len
(
lod
[
0
])
w
=
self
.
inputs
[
'Weight'
]
D
=
weight
.
shape
[
0
]
b
=
self
.
inputs
[
'Bias'
]
if
'Bias'
in
self
.
inputs
else
np
.
zeros
(
batch_gate
=
np
.
zeros
((
T
,
3
*
D
),
dtype
=
'float64'
)
(
1
,
self
.
frame_size
*
3
))
batch_reset_hidden_prev
=
np
.
zeros
((
T
,
D
),
dtype
=
'float64'
)
batch_gate
=
self
.
outputs
[
'BatchGate'
]
batch_hidden
=
np
.
zeros
((
T
,
D
),
dtype
=
'float64'
)
batch_reset_hidden_prev
=
self
.
outputs
[
'BatchResetHiddenPrev'
]
hidden
=
np
.
zeros
((
T
,
D
),
dtype
=
'float64'
)
batch_hidden
=
self
.
outputs
[
'BatchHidden'
]
hidden
=
self
.
outputs
[
'Hidden'
]
idx_in_seq_list
,
sorted_seqs
=
_seq_to_batch
(
lod
,
is_reverse
)
idx_in_seq_list
=
self
.
idx_in_seq_list
h_p
=
h0
[
sorted_seqs
]
h_p
=
self
.
inputs
[
'H0'
][
max_seq_len
=
len
(
idx_in_seq_list
)
self
.
sorted_seqs
]
if
'H0'
in
self
.
inputs
else
np
.
zeros
(
assert
len
(
idx_in_seq_list
[
0
])
==
N
(
len
(
idx_in_seq_list
[
0
]),
self
.
frame_size
))
num_batch
=
len
(
idx_in_seq_list
)
end_idx
=
0
end_idx
=
0
for
batch_idx
in
range
(
num_batch
):
for
batch_idx
in
range
(
max_seq_len
):
x
=
input
[
idx_in_seq_list
[
batch_idx
]]
x
=
input
[
idx_in_seq_list
[
batch_idx
]]
g
,
r_h_p
,
h
=
self
.
gru_step
(
x
,
h_p
,
w
,
b
)
g
,
r_h_p
,
h
=
_step
(
x
,
h_p
,
weight
,
bias
,
act_state
,
act_gate
)
if
batch_idx
<
(
num_batch
-
1
):
if
batch_idx
<
(
max_seq_len
-
1
):
h_p
=
h
[:
len
(
idx_in_seq_list
[
batch_idx
+
1
])]
h_p
=
h
[:
len
(
idx_in_seq_list
[
batch_idx
+
1
])]
start_idx
=
end_idx
start_idx
=
end_idx
end_idx
=
start_idx
+
len
(
idx_in_seq_list
[
batch_idx
])
end_idx
=
start_idx
+
len
(
idx_in_seq_list
[
batch_idx
])
...
@@ -101,76 +92,86 @@ class TestGRUOp(OpTest):
...
@@ -101,76 +92,86 @@ class TestGRUOp(OpTest):
batch_reset_hidden_prev
[
start_idx
:
end_idx
]
=
r_h_p
batch_reset_hidden_prev
[
start_idx
:
end_idx
]
=
r_h_p
batch_hidden
[
start_idx
:
end_idx
]
=
h
batch_hidden
[
start_idx
:
end_idx
]
=
h
hidden
[
idx_in_seq_list
[
batch_idx
]]
=
h
hidden
[
idx_in_seq_list
[
batch_idx
]]
=
h
return
batch_gate
,
batch_reset_hidden_prev
,
hidden
return
batch_gate
,
batch_reset_hidden_prev
,
batch_hidden
,
hidden
def
set_data
(
self
):
lod
=
self
.
lod
self
.
idx_in_seq_list
,
self
.
sorted_seqs
=
self
.
seq_to_batch
(
lod
,
self
.
is_reverse
)
batch_size
=
self
.
batch_size
frame_size
=
self
.
frame_size
input
=
np
.
random
.
rand
(
batch_size
,
frame_size
*
3
).
astype
(
'float64'
)
h0
=
np
.
random
.
rand
(
len
(
self
.
idx_in_seq_list
[
0
]),
frame_size
).
astype
(
'float64'
)
weight
=
np
.
random
.
rand
(
frame_size
,
frame_size
*
3
).
astype
(
'float64'
)
bias
=
np
.
random
.
rand
(
1
,
frame_size
*
3
).
astype
(
'float64'
)
self
.
inputs
=
{
'Input'
:
(
input
,
lod
),
'H0'
:
h0
,
'Weight'
:
weight
,
'Bias'
:
bias
}
self
.
outputs
=
{
'BatchGate'
:
np
.
zeros
(
(
batch_size
,
frame_size
*
3
),
dtype
=
'float64'
),
'BatchResetHiddenPrev'
:
np
.
zeros
(
(
batch_size
,
frame_size
),
dtype
=
'float64'
),
'BatchHidden'
:
np
.
zeros
(
(
batch_size
,
frame_size
),
dtype
=
'float64'
),
'Hidden'
:
np
.
zeros
(
(
batch_size
,
frame_size
),
dtype
=
'float64'
)
}
class
TestGRUOp
(
OpTest
):
def
set_confs
(
self
):
def
set_confs
(
self
):
self
.
is_reverse
=
False
pass
self
.
attrs
=
{
'activation'
:
'tanh'
,
'gate_activation'
:
'sigmoid'
,
'is_reverse'
:
self
.
is_reverse
}
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"gru"
self
.
op_type
=
"gru"
self
.
lod
=
[[
2
,
4
,
3
]]
self
.
D
=
5
self
.
is_reverse
=
False
self
.
with_h0
=
True
self
.
with_bias
=
True
self
.
act_state
=
'tanh'
self
.
act_gate
=
'sigmoid'
self
.
set_confs
()
self
.
set_confs
()
self
.
set_data
()
self
.
gru
()
T
=
sum
(
self
.
lod
[
0
])
N
=
len
(
self
.
lod
[
0
])
input
=
np
.
random
.
rand
(
T
,
3
*
self
.
D
).
astype
(
'float64'
)
weight
=
np
.
random
.
rand
(
self
.
D
,
3
*
self
.
D
).
astype
(
'float64'
)
bias
=
np
.
random
.
rand
(
1
,
3
*
self
.
D
).
astype
(
'float64'
)
if
self
.
with_bias
else
np
.
zeros
(
(
1
,
3
*
self
.
D
),
dtype
=
'float64'
)
h0
=
np
.
random
.
rand
(
N
,
self
.
D
).
astype
(
'float64'
)
if
self
.
with_h0
else
np
.
zeros
(
(
N
,
self
.
D
),
dtype
=
'float64'
)
batch_gate
,
batch_reset_hidden_prev
,
batch_hidden
,
hidden
=
gru
(
input
,
self
.
lod
,
h0
,
weight
,
bias
,
self
.
is_reverse
,
ACTIVATION
[
self
.
act_state
],
ACTIVATION
[
self
.
act_gate
])
self
.
inputs
=
{
'Input'
:
(
input
,
self
.
lod
),
'Weight'
:
weight
}
if
self
.
with_bias
:
self
.
inputs
[
'Bias'
]
=
bias
if
self
.
with_h0
:
self
.
inputs
[
'H0'
]
=
h0
self
.
outputs
=
{
'Hidden'
:
(
hidden
,
self
.
lod
),
'BatchGate'
:
batch_gate
,
'BatchResetHiddenPrev'
:
batch_reset_hidden_prev
,
'BatchHidden'
:
batch_hidden
,
}
self
.
attrs
=
{
'activation'
:
self
.
act_state
,
'gate_activation'
:
self
.
act_gate
,
'is_reverse'
:
self
.
is_reverse
}
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
()
self
.
check_output
(
atol
=
1e-8
)
def
test_check_grad
(
self
):
def
test_check_grad
(
self
):
self
.
check_grad
([
'Input'
,
'H0'
,
'Weight'
,
'Bias'
],
[
'Hidden'
])
self
.
check_grad
([
'Input'
,
'H0'
,
'Weight'
,
'Bias'
],
[
'Hidden'
])
class
TestGRUOpNoInitial
(
TestGRUOp
):
class
TestGRUOpNoInitial
(
TestGRUOp
):
def
set_data
(
self
):
def
set_confs
(
self
):
super
(
TestGRUOpNoInitial
,
self
).
set_data
()
self
.
with_h0
=
False
self
.
inputs
.
pop
(
'H0'
)
def
test_check_grad
(
self
):
def
test_check_grad
(
self
):
self
.
check_grad
([
'Input'
,
'Weight'
,
'Bias'
],
[
'Hidden'
])
self
.
check_grad
([
'Input'
,
'Weight'
,
'Bias'
],
[
'Hidden'
])
class TestGRUOpNoBias(TestGRUOp):
    """GRU test variant with the bias disabled."""

    def set_confs(self):
        self.with_bias = False

    def test_check_grad(self):
        # 'Bias' is left out of the inputs whose gradients are checked.
        checked_inputs = ['Input', 'H0', 'Weight']
        self.check_grad(checked_inputs, ['Hidden'])
class
TestGRUOpReverse
(
TestGRUOp
):
class
TestGRUOpReverse
(
TestGRUOp
):
def
set_confs
(
self
):
def
set_confs
(
self
):
self
.
is_reverse
=
True
self
.
is_reverse
=
True
self
.
attrs
=
{
'activation'
:
'tanh'
,
'gate_activation'
:
'sigmoid'
,
'is_reverse'
:
self
.
is_reverse
}
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_pad_constant_like.py
0 → 100644
浏览文件 @
b98b7440
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
class TestPadOp(OpTest):
    """Output and gradient check for the 'pad_constant_like' operator.

    The expected output is input 'Y' padded with `pad_value` by the amounts
    in `paddings`, computed with numpy.pad.
    """

    def setUp(self):
        self.initTestCase()
        self.op_type = "pad_constant_like"

        # Random draws happen in the order X, then Y.
        x = np.random.random(self.x_shape).astype("float32")
        y = np.random.random(self.y_shape).astype("float32")
        self.inputs = {'X': x, 'Y': y}
        self.attrs = {'pad_value': self.pad_value}

        expected = np.pad(self.inputs['Y'],
                          self.paddings,
                          mode='constant',
                          constant_values=self.pad_value)
        self.outputs = {'Out': expected}

    def test_check_output(self):
        self.check_output()

    def test_check_grad_normal(self):
        self.check_grad(['Y'], 'Out', max_relative_error=0.006)

    def initTestCase(self):
        """Default case: pad a (3, 16) input up to (16, 16)."""
        self.x_shape = (16, 16)
        self.y_shape = (3, 16)
        self.pad_value = 0.1
        self.paddings = [(0, 13), (0, 0)]
class TestCase1(TestPadOp):
    """4-D case that pads only the first dimension (2 -> 4)."""

    def initTestCase(self):
        self.pad_value = 0.5
        self.x_shape = (4, 3, 4, 4)
        self.y_shape = (2, 3, 4, 4)
        self.paddings = [(0, 2), (0, 0), (0, 0), (0, 0)]
class TestCase2(TestPadOp):
    """4-D case that pads the first and third dimensions."""

    def initTestCase(self):
        self.pad_value = 0.5
        self.x_shape = (4, 3, 4, 4)
        self.y_shape = (2, 3, 2, 4)
        self.paddings = [(0, 2), (0, 0), (0, 2), (0, 0)]
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_print_op.py
浏览文件 @
b98b7440
...
@@ -35,9 +35,8 @@ class TestPrintOpCPU(unittest.TestCase):
...
@@ -35,9 +35,8 @@ class TestPrintOpCPU(unittest.TestCase):
def
build_network
(
self
,
only_forward
,
**
kargs
):
def
build_network
(
self
,
only_forward
,
**
kargs
):
x
=
layers
.
data
(
'x'
,
shape
=
[
3
],
dtype
=
'float32'
,
lod_level
=
1
)
x
=
layers
.
data
(
'x'
,
shape
=
[
3
],
dtype
=
'float32'
,
lod_level
=
1
)
x
.
stop_gradient
=
False
x
.
stop_gradient
=
False
printed
=
layers
.
Print
(
input
=
x
,
**
kargs
)
layers
.
Print
(
input
=
x
,
**
kargs
)
if
only_forward
:
return
printed
loss
=
layers
.
mean
(
x
)
loss
=
layers
.
mean
(
printed
)
append_backward
(
loss
=
loss
)
append_backward
(
loss
=
loss
)
return
loss
return
loss
...
...
python/paddle/fluid/tests/unittests/test_scale_op.py
浏览文件 @
b98b7440
...
@@ -17,6 +17,8 @@ from __future__ import print_function
...
@@ -17,6 +17,8 @@ from __future__ import print_function
import
unittest
import
unittest
import
numpy
as
np
import
numpy
as
np
from
op_test
import
OpTest
from
op_test
import
OpTest
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
class
TestScaleOp
(
OpTest
):
class
TestScaleOp
(
OpTest
):
...
@@ -33,5 +35,57 @@ class TestScaleOp(OpTest):
...
@@ -33,5 +35,57 @@ class TestScaleOp(OpTest):
self
.
check_grad
([
'X'
],
'Out'
)
self
.
check_grad
([
'X'
],
'Out'
)
class TestScaleOpSelectedRows(unittest.TestCase):
    """Checks the 'scale' operator on SelectedRows variables."""

    def check_with_place(self, place, in_name, out_name):
        """Run 'scale' on `place` and verify height, rows and values.

        Passing the same name for `in_name` and `out_name` exercises the
        in-place case.
        """
        scope = core.Scope()

        # create and initialize the input SelectedRows variable
        in_height = 10
        in_rows = [0, 4, 7]
        in_row_numel = 12
        scale = 2.0

        in_selected_rows = scope.var(in_name).get_selected_rows()
        in_selected_rows.set_height(in_height)
        in_selected_rows.set_rows(in_rows)
        in_array = np.random.random(
            (len(in_rows), in_row_numel)).astype("float32")

        in_tensor = in_selected_rows.get_tensor()
        in_tensor.set(in_array, place)

        # create the output SelectedRows variable with matching dimensions
        out_selected_rows = scope.var(out_name).get_selected_rows()
        out_tensor = out_selected_rows.get_tensor()
        out_tensor._set_dims(in_tensor._get_dims())

        # create and run the scale operator
        scale_op = Operator("scale", X=in_name, Out=out_name, scale=scale)
        scale_op.run(scope, place)

        # get and compare result
        out_height = out_selected_rows.height()
        out_rows = out_selected_rows.rows()
        result_array = np.array(out_tensor)

        # unittest assertions still run under `python -O`, unlike bare
        # assert statements.
        self.assertTrue((in_array * scale == result_array).all())
        self.assertEqual(in_height, out_height)
        self.assertEqual(in_rows, out_rows)

    def _places(self):
        """Return the CPU place, plus the first CUDA place if available."""
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        return places

    def test_scale_selected_rows(self):
        for place in self._places():
            self.check_with_place(place, 'in', 'out')

    def test_scale_selected_rows_inplace(self):
        for place in self._places():
            self.check_with_place(place, 'in', 'in')
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_sequence_pad_op.py
0 → 100644
浏览文件 @
b98b7440
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
class TestSequencePadOp(OpTest):
    """Output and gradient check for the 'sequence_pad' operator.

    Subclasses override set_attr() to vary the input shape, pad value and
    padded length.
    """

    def set_attr(self):
        self.x_shape = [12, 4]
        self.x_len_lod = [[2, 3, 4, 3]]
        self.pad_value = [1.0]
        self.padded_length = -1
        self.dtype = 'float32'

    def set_data(self):
        """Create the random input, the pad value tensor and the attrs."""
        x_data = np.random.uniform(0.1, 0.5, self.x_shape).astype(self.dtype)
        pad_value_data = np.array(self.pad_value).astype(self.dtype)
        self.inputs = {
            'X': (x_data, self.x_len_lod),
            'PadValue': pad_value_data
        }
        self.attrs = {'padded_length': self.padded_length}

    def compute(self):
        """Build the expected padded output with numpy."""
        seq_lens = self.x_len_lod[0]

        # get padded length: -1 means "pad to the longest sequence"
        target_len = self.padded_length
        if target_len == -1:
            target_len = max([0] + list(seq_lens))

        # do padding
        x_data = self.inputs['X'][0]
        pad_value_data = self.inputs['PadValue']
        if pad_value_data.shape == (1, ):
            # A single pad value is expanded to the shape of one time step.
            pad_value_data = np.broadcast_to(
                pad_value_data, shape=x_data.shape[1:])
        pad_row = pad_value_data[np.newaxis, :]

        padded_sequences = []
        offset = 0
        for length in seq_lens:
            seq = x_data[offset:offset + length]
            missing = target_len - length
            if missing > 0:
                seq = np.concatenate([seq] + [pad_row] * missing, axis=0)
            padded_sequences.append(seq)
            offset += length

        self.outputs = {'Out': np.array(padded_sequences)}

    def setUp(self):
        self.op_type = 'sequence_pad'
        self.set_attr()
        self.set_data()
        self.compute()

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(["X"], "Out")
class TestSequencePadOp2(TestSequencePadOp):
    """2-D input, one pad value per column, padded to the longest sequence."""

    def set_attr(self):
        self.dtype = 'float32'
        self.x_shape = [12, 4]
        self.x_len_lod = [[2, 3, 4, 3]]
        self.pad_value = [1.0, 2.0, 3.0, 4.0]
        self.padded_length = -1
class TestSequencePadOp3(TestSequencePadOp):
    """2-D input, single pad value, explicit padded length of 7."""

    def set_attr(self):
        self.dtype = 'float32'
        self.x_shape = [12, 4]
        self.x_len_lod = [[2, 3, 4, 3]]
        self.pad_value = [1.0]
        self.padded_length = 7
class TestSequencePadOp4(TestSequencePadOp):
    """2-D input, one pad value per column, explicit padded length of 7."""

    def set_attr(self):
        self.dtype = 'float32'
        self.x_shape = [12, 4]
        self.x_len_lod = [[2, 3, 4, 3]]
        self.pad_value = [1.0, 2.0, 3.0, 4.0]
        self.padded_length = 7
class TestSequencePadOp5(TestSequencePadOp):
    """3-D input, single pad value, padded to the longest sequence."""

    def set_attr(self):
        self.dtype = 'float32'
        self.x_shape = [12, 2, 2]
        self.x_len_lod = [[2, 3, 4, 3]]
        self.pad_value = [1.0]
        self.padded_length = -1
class TestSequencePadOp6(TestSequencePadOp):
    """3-D input, a 2x2 pad value, padded to the longest sequence."""

    def set_attr(self):
        self.dtype = 'float32'
        self.x_shape = [12, 2, 2]
        self.x_len_lod = [[2, 3, 4, 3]]
        self.pad_value = [[1.0, 2.0], [3.0, 4.0]]
        self.padded_length = -1
class TestSequencePadOp7(TestSequencePadOp):
    """3-D input, single pad value, explicit padded length of 7."""

    def set_attr(self):
        self.dtype = 'float32'
        self.x_shape = [12, 2, 2]
        self.x_len_lod = [[2, 3, 4, 3]]
        self.pad_value = [1.0]
        self.padded_length = 7
python/paddle/fluid/tests/unittests/test_tensor.py
浏览文件 @
b98b7440
...
@@ -59,6 +59,27 @@ class TestTensor(unittest.TestCase):
...
@@ -59,6 +59,27 @@ class TestTensor(unittest.TestCase):
self
.
assertAlmostEqual
(
1.0
,
tensor_array_2
[
3
,
9
])
self
.
assertAlmostEqual
(
1.0
,
tensor_array_2
[
3
,
9
])
self
.
assertAlmostEqual
(
2.0
,
tensor_array_2
[
19
,
11
])
self
.
assertAlmostEqual
(
2.0
,
tensor_array_2
[
19
,
11
])
def test_int8_tensor(self):
    """Round-trip an int8 array through a tensor on CPU (and CUDA if built).

    The arrays are compared element-wise. Comparing `a.all()` with
    `b.all()` would only compare two booleans and pass for any contents.
    """
    scope = core.Scope()
    var = scope.var("int8_tensor")
    cpu_tensor = var.get_tensor()
    tensor_array = numpy.random.randint(
        -127, high=128, size=[100, 200], dtype=numpy.int8)
    place = core.CPUPlace()
    cpu_tensor.set(tensor_array, place)
    cpu_tensor_array_2 = numpy.array(cpu_tensor)
    self.assertTrue(numpy.array_equal(cpu_tensor_array_2, tensor_array))

    if core.is_compiled_with_cuda():
        cuda_tensor = var.get_tensor()
        tensor_array = numpy.random.randint(
            -127, high=128, size=[100, 200], dtype=numpy.int8)
        place = core.CUDAPlace(0)
        cuda_tensor.set(tensor_array, place)
        cuda_tensor_array_2 = numpy.array(cuda_tensor)
        self.assertTrue(numpy.array_equal(cuda_tensor_array_2, tensor_array))
def
test_int_lod_tensor
(
self
):
def
test_int_lod_tensor
(
self
):
place
=
core
.
CPUPlace
()
place
=
core
.
CPUPlace
()
scope
=
core
.
Scope
()
scope
=
core
.
Scope
()
...
...
python/paddle/fluid/tests/unittests/test_unstack_op.py
0 → 100644
浏览文件 @
b98b7440
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
op_test
import
OpTest
import
numpy
as
np
import
unittest
class TestUnStackOpBase(OpTest):
    """Output and gradient check for the 'unstack' operator.

    The input is split along `axis` into `input_dim[axis]` outputs named
    'y0', 'y1', ...; each expected output has that axis removed.
    Subclasses override initParameters() to change the defaults.
    """

    def initDefaultParameters(self):
        self.input_dim = (5, 6, 7)
        self.axis = 0
        self.dtype = 'float32'

    def initParameters(self):
        """Hook for subclasses; called after the defaults are set."""
        pass

    def get_y_names(self):
        """Return the output names, one per slice along `axis`."""
        return [
            'y{}'.format(i) for i in range(self.input_dim[self.axis])
        ]

    def setUp(self):
        self.initDefaultParameters()
        self.initParameters()
        self.op_type = 'unstack'
        self.x = np.random.random(size=self.input_dim).astype(self.dtype)

        num_outputs = self.input_dim[self.axis]
        outs = np.split(self.x, num_outputs, self.axis)
        # np.split keeps the split axis with length 1; drop it from the
        # expected shape.
        new_shape = list(self.input_dim)
        del new_shape[self.axis]
        y_names = self.get_y_names()
        expected = [(y_names[i], np.reshape(outs[i], new_shape))
                    for i in range(num_outputs)]

        self.inputs = {'X': self.x}
        self.outputs = {'Y': expected}
        self.attrs = {'axis': self.axis, 'num': num_outputs}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        # Pass the input names as a list; a bare string only works when
        # the name happens to be a single character.
        self.check_grad(['X'], self.get_y_names())
class TestStackOp3(TestUnStackOpBase):
    """Unstack along the last axis, given as a negative index."""

    def initParameters(self):
        self.axis = -1
class TestStackOp4(TestUnStackOpBase):
    """Unstack along axis -3 of the default 3-D input."""

    def initParameters(self):
        self.axis = -3
class TestStackOp5(TestUnStackOpBase):
    """Unstack along axis 1."""

    def initParameters(self):
        self.axis = 1
class TestStackOp6(TestUnStackOpBase):
    """Unstack along axis 2."""

    def initParameters(self):
        self.axis = 2
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_variable.py
浏览文件 @
b98b7440
...
@@ -31,7 +31,8 @@ class TestVariable(unittest.TestCase):
...
@@ -31,7 +31,8 @@ class TestVariable(unittest.TestCase):
self
.
assertEqual
(
DT
.
INT16
,
convert
(
"int16"
))
self
.
assertEqual
(
DT
.
INT16
,
convert
(
"int16"
))
self
.
assertEqual
(
DT
.
INT64
,
convert
(
"int64"
))
self
.
assertEqual
(
DT
.
INT64
,
convert
(
"int64"
))
self
.
assertEqual
(
DT
.
BOOL
,
convert
(
"bool"
))
self
.
assertEqual
(
DT
.
BOOL
,
convert
(
"bool"
))
self
.
assertRaises
(
ValueError
,
lambda
:
convert
(
"int8"
))
self
.
assertEqual
(
DT
.
INT8
,
convert
(
"int8"
))
self
.
assertEqual
(
DT
.
UINT8
,
convert
(
"uint8"
))
def
test_var
(
self
):
def
test_var
(
self
):
b
=
default_main_program
().
current_block
()
b
=
default_main_program
().
current_block
()
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
b98b7440
...
@@ -31,7 +31,6 @@ Steps to transpile pserver:
...
@@ -31,7 +31,6 @@ Steps to transpile pserver:
"""
"""
import
math
import
math
import
random
import
numpy
as
np
import
numpy
as
np
import
collections
import
collections
import
six
import
six
...
@@ -239,8 +238,8 @@ class DistributeTranspiler(object):
...
@@ -239,8 +238,8 @@ class DistributeTranspiler(object):
grad_var_mapping_items
=
list
(
six
.
iteritems
(
self
.
grad_var_mapping
))
grad_var_mapping_items
=
list
(
six
.
iteritems
(
self
.
grad_var_mapping
))
if
not
self
.
config
.
slice_var_up
:
if
not
self
.
config
.
slice_var_up
:
random
.
seed
(
self
.
origin_program
.
random_seed
)
np
.
random
.
seed
(
self
.
origin_program
.
random_seed
)
random
.
shuffle
(
grad_var_mapping_items
)
np
.
random
.
shuffle
(
grad_var_mapping_items
)
grad_name_to_send_dummy_out
=
dict
()
grad_name_to_send_dummy_out
=
dict
()
for
grad_varname
,
splited_vars
in
grad_var_mapping_items
:
for
grad_varname
,
splited_vars
in
grad_var_mapping_items
:
...
@@ -284,10 +283,13 @@ class DistributeTranspiler(object):
...
@@ -284,10 +283,13 @@ class DistributeTranspiler(object):
send_vars
.
append
(
var
)
send_vars
.
append
(
var
)
if
self
.
sync_mode
:
if
self
.
sync_mode
:
send_barrier_out
=
program
.
global_block
().
create_var
(
name
=
framework
.
generate_control_dev_var_name
())
input_deps
=
grad_name_to_send_dummy_out
.
values
()
program
.
global_block
().
append_op
(
program
.
global_block
().
append_op
(
type
=
"send_barrier"
,
type
=
"send_barrier"
,
inputs
=
{},
inputs
=
{
"X"
:
input_deps
},
outputs
=
{},
outputs
=
{
"Out"
:
send_barrier_out
},
attrs
=
{
attrs
=
{
"endpoints"
:
pserver_endpoints
,
"endpoints"
:
pserver_endpoints
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
...
@@ -305,16 +307,22 @@ class DistributeTranspiler(object):
...
@@ -305,16 +307,22 @@ class DistributeTranspiler(object):
self
.
param_grad_ep_mapping
[
ep
][
"grads"
].
append
(
send_vars
[
i
])
self
.
param_grad_ep_mapping
[
ep
][
"grads"
].
append
(
send_vars
[
i
])
# step4: Concat the parameters splits together after recv.
# step4: Concat the parameters splits together after recv.
all_recv_outputs
=
[]
for
param_varname
,
splited_var
in
six
.
iteritems
(
self
.
param_var_mapping
):
for
param_varname
,
splited_var
in
six
.
iteritems
(
self
.
param_var_mapping
):
eps
=
[]
eps
=
[]
for
var
in
splited_var
:
for
var
in
splited_var
:
index
=
[
v
.
name
for
v
in
recv_vars
].
index
(
var
.
name
)
index
=
[
v
.
name
for
v
in
recv_vars
].
index
(
var
.
name
)
eps
.
append
(
eplist
[
index
])
eps
.
append
(
eplist
[
index
])
grad_send_dummy_out
=
grad_name_to_send_dummy_out
[
if
self
.
sync_mode
:
recv_dep_in
=
send_barrier_out
else
:
# connect deps to send op in async mode
recv_dep_in
=
grad_name_to_send_dummy_out
[
self
.
param_name_to_grad_name
[
param_varname
]]
self
.
param_name_to_grad_name
[
param_varname
]]
all_recv_outputs
.
extend
(
splited_var
)
program
.
global_block
().
append_op
(
program
.
global_block
().
append_op
(
type
=
"recv"
,
type
=
"recv"
,
inputs
=
{
"X"
:
[
grad_send_dummy_out
]},
inputs
=
{
"X"
:
[
recv_dep_in
]},
outputs
=
{
"Out"
:
splited_var
},
outputs
=
{
"Out"
:
splited_var
},
attrs
=
{
attrs
=
{
"epmap"
:
eps
,
"epmap"
:
eps
,
...
@@ -327,10 +335,11 @@ class DistributeTranspiler(object):
...
@@ -327,10 +335,11 @@ class DistributeTranspiler(object):
})
})
if
self
.
sync_mode
:
if
self
.
sync_mode
:
# form a WAW dependency
program
.
global_block
().
append_op
(
program
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
type
=
"fetch_barrier"
,
inputs
=
{},
inputs
=
{},
outputs
=
{},
outputs
=
{
"Out"
:
all_recv_outputs
},
attrs
=
{
attrs
=
{
"endpoints"
:
pserver_endpoints
,
"endpoints"
:
pserver_endpoints
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
...
@@ -414,10 +423,12 @@ class DistributeTranspiler(object):
...
@@ -414,10 +423,12 @@ class DistributeTranspiler(object):
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
})
fetch_barrier_out
=
startup_program
.
global_block
().
create_var
(
name
=
framework
.
generate_control_dev_var_name
())
startup_program
.
global_block
().
append_op
(
startup_program
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
type
=
"fetch_barrier"
,
inputs
=
{},
inputs
=
{},
outputs
=
{},
outputs
=
{
"Out"
:
fetch_barrier_out
},
attrs
=
{
attrs
=
{
"endpoints"
:
self
.
pserver_endpoints
,
"endpoints"
:
self
.
pserver_endpoints
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录