Commit a58dd3e5
Authored Aug 04, 2018 by minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into port_python3_syntax

Parents: 850c3945, 0964de11

Showing 104 changed files with 2720 additions and 710 deletions (+2720 −710)
cmake/external/anakin.cmake  +4 −3
paddle/fluid/API.spec  +2 −2
paddle/fluid/framework/CMakeLists.txt  +1 −0
paddle/fluid/framework/data_type.cc  +3 −1
paddle/fluid/framework/data_type_test.cc  +40 −0
paddle/fluid/framework/details/exception_holder.h  +83 −0
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h  +3 −1
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc  +5 −22
paddle/fluid/framework/details/threaded_ssa_graph_executor.h  +3 −3
paddle/fluid/framework/ir/graph_helper_test.cc  +2 −2
paddle/fluid/framework/ir/graph_test.cc  +6 −6
paddle/fluid/framework/op_kernel_type_test.cc  +7 −0
paddle/fluid/framework/op_proto_maker.cc  +34 −0
paddle/fluid/framework/op_proto_maker.h  +2 −0
paddle/fluid/framework/op_proto_maker_test.cc  +102 −5
paddle/fluid/framework/operator.cc  +33 −2
paddle/fluid/framework/tensor.h  +1 −1
paddle/fluid/framework/tensor_test.cc  +15 −0
paddle/fluid/inference/analysis/CMakeLists.txt  +3 −0
paddle/fluid/inference/analysis/analyzer.cc  +7 −0
paddle/fluid/inference/analysis/analyzer.h  +13 −17
paddle/fluid/inference/analysis/analyzer_main.cc  +33 −0
paddle/fluid/inference/analysis/analyzer_tester.cc  +6 −2
paddle/fluid/inference/analysis/argument.h  +14 −0
paddle/fluid/inference/analysis/data_flow_graph.h  +3 −1
paddle/fluid/inference/analysis/data_flow_graph_tester.cc  +2 −2
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc  +5 −5
paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc  +9 −3
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc  +30 −6
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h  +1 −1
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc  +2 −1
paddle/fluid/inference/analysis/helper.h  +15 −0
paddle/fluid/inference/analysis/model_store_pass.cc  +63 −0
paddle/fluid/inference/analysis/model_store_pass.h  +53 −0
paddle/fluid/inference/analysis/model_store_pass_tester.cc  +43 −0
paddle/fluid/inference/analysis/pass.h  +1 −0
paddle/fluid/inference/analysis/pass_manager_tester.cc  +5 −2
paddle/fluid/inference/analysis/subgraph_splitter_tester.cc  +4 −4
paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc  +3 −3
paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc  +3 −4
paddle/fluid/inference/analysis/ut_helper.h  +3 −18
paddle/fluid/inference/api/CMakeLists.txt  +4 −1
paddle/fluid/inference/api/api_anakin_engine.cc  +71 −20
paddle/fluid/inference/api/api_anakin_engine.h  +12 −8
paddle/fluid/inference/api/api_anakin_engine_tester.cc  +17 −16
paddle/fluid/inference/api/api_impl.cc  +11 −0
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc  +12 −0
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc  +4 −5
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc  +9 −8
paddle/fluid/inference/api/demo_ci/vis_demo.cc  +6 −6
paddle/fluid/inference/api/paddle_inference_api.h  +4 −2
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt  +6 −2
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc  +50 −1
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc  +210 −0
paddle/fluid/inference/tensorrt/convert/fc_op.cc  +1 −1
paddle/fluid/inference/tensorrt/convert/op_converter.h  +25 −0
paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc  +57 −0
paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc  +73 −0
paddle/fluid/inference/tensorrt/convert/test_op_converter.cc  +33 −3
paddle/fluid/inference/tensorrt/convert/ut_helper.h  +1 −1
paddle/fluid/operators/conv_cudnn_op.cu.cc  +4 −4
paddle/fluid/operators/conv_mkldnn_op.cc  +12 −12
paddle/fluid/operators/elementwise_add_mkldnn_op.cc  +25 −22
paddle/fluid/operators/elementwise_add_op.cc  +3 −1
paddle/fluid/operators/elementwise_add_op.h  +9 −7
paddle/fluid/operators/elementwise_div_op.cc  +2 −0
paddle/fluid/operators/elementwise_op.h  +71 −3
paddle/fluid/operators/elementwise_op_function.h  +122 −64
paddle/fluid/operators/elementwise_sub_op.cc  +4 −1
paddle/fluid/operators/elementwise_sub_op.h  +6 −5
paddle/fluid/operators/listen_and_serv_op.cc  +25 −0
paddle/fluid/operators/math/im2col.cc  +10 −52
paddle/fluid/operators/math/im2col_cfo_cpu.h  +252 −0
paddle/fluid/operators/math/im2col_test.cc  +103 −72
paddle/fluid/operators/reshape_op.cc  +5 −26
paddle/fluid/operators/softmax_op.cc  +23 −4
paddle/fluid/operators/split_ids_op.h  +11 −4
paddle/fluid/platform/cuda_helper_test.cu  +109 −74
paddle/fluid/platform/cuda_primitives.h  +10 −10
paddle/scripts/paddle_build.sh  +1 −1
python/paddle/batch.py  +6 −0
python/paddle/fluid/__init__.py  +1 −0
python/paddle/fluid/layers/detection.py  +1 −0
python/paddle/fluid/layers/nn.py  +4 −5
python/paddle/fluid/layers/ops.py  +0 −2
python/paddle/fluid/parallel_executor.py  +1 −1
python/paddle/fluid/tests/unittests/CMakeLists.txt  +1 −0
python/paddle/fluid/tests/unittests/dist_transformer.py  +280 −0
python/paddle/fluid/tests/unittests/op_test.py  +61 −11
python/paddle/fluid/tests/unittests/test_dist_base.py  +137 −0
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py  +5 −118
python/paddle/fluid/tests/unittests/test_dist_transformer.py  +27 −0
python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py  +2 −2
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py  +2 −0
python/paddle/fluid/tests/unittests/test_layers.py  +9 −0
python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py  +24 −0
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py  +2 −1
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py  +2 −3
python/paddle/fluid/tests/unittests/test_reshape_op.py  +3 −3
python/paddle/fluid/tests/unittests/test_split_ids_op.py  +52 −0
python/paddle/fluid/tests/unittests/testsuite.py  +17 −9
python/paddle/fluid/tests/unittests/transformer_model.py  +1 −1
python/paddle/fluid/transpiler/distribute_transpiler.py  +1 −0
tools/manylinux1/Dockerfile.x64  +1 −1
cmake/external/anakin.cmake
@@ -8,6 +8,7 @@ set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header f
 set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
 set(ANAKIN_COMPILE_EXTRA_FLAGS
+    -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
     -Wno-error=unused-variable -Wno-unused-variable
     -Wno-error=format-extra-args -Wno-format-extra-args
     -Wno-error=comment -Wno-comment
@@ -19,7 +20,7 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
     -Wno-reorder
     -Wno-error=cpp)
-set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz")

 # A helper function used in Anakin, currently, to use it, one need to recursively include
 # nearly all the header files.
@@ -41,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
     message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
     execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
 endif()

 if (WITH_ANAKIN)
paddle/fluid/API.spec
@@ -263,9 +263,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar
 paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
@@ -306,7 +304,9 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
 paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
 paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
 paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
+paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle/fluid/framework/CMakeLists.txt
@@ -7,6 +7,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
+cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
   nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
 else()
paddle/fluid/framework/data_type.cc
@@ -17,6 +17,8 @@
 #include <string>
 #include <unordered_map>

+using float16 = paddle::platform::float16;
+
 namespace paddle {
 namespace framework {
@@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() {
   RegisterType<cc_type>(retv, proto_type, #cc_type)

   // NOTE: Add your customize type here.
-  RegType(platform::float16, proto::VarType::FP16);
+  RegType(float16, proto::VarType::FP16);
   RegType(float, proto::VarType::FP32);
   RegType(double, proto::VarType::FP64);
   RegType(int, proto::VarType::INT32);
paddle/fluid/framework/data_type_test.cc (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/data_type.h"

#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"

TEST(DataType, float16) {
  using paddle::framework::Tensor;
  using paddle::platform::CPUPlace;
  using paddle::platform::float16;
  namespace f = paddle::framework;
  f::proto::VarType::Type dtype = f::proto::VarType::FP16;

  Tensor tensor;
  CPUPlace cpu;
  tensor.mutable_data(cpu, f::ToTypeIndex(dtype));

  // test fp16 tensor
  EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));

  // test fp16 size
  EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u);

  // test debug info
  std::string type = "float16";
  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
}
paddle/fluid/framework/details/exception_holder.h (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace details {

class ExceptionHolder {
 public:
  void Catch(const platform::EnforceNotMet& exp) {
    std::lock_guard<std::mutex> lock(mu_);
    exception_.reset(new platform::EnforceNotMet(exp));
    type_ = kEnforceNotMet;
  }

  void Catch(const platform::EOFException& exp) {
    std::lock_guard<std::mutex> lock(mu_);
    // EOFException will not cover up existing EnforceNotMet.
    if (exception_.get() == nullptr) {
      exception_.reset(new platform::EOFException(exp));
      type_ = kEOF;
    }
  }

  bool ExceptionCatched() const {
    std::lock_guard<std::mutex> lock(mu_);
    return exception_.get() != nullptr;
  }

  void Throw() {
    std::lock_guard<std::mutex> lock(mu_);
    switch (type_) {
      case kNone:
        break;
      case kEnforceNotMet: {
        auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
        throw e;
        break;
      }
      case kEOF: {
        auto e = *static_cast<platform::EOFException*>(exception_.get());
        throw e;
        break;
      }
      default:
        LOG(FATAL) << "Unknown exception.";
    }
    exception_.reset();
    type_ = kNone;
  }

  void Clear() {
    std::lock_guard<std::mutex> lock(mu_);
    exception_.reset();
    type_ = kNone;
  }

 private:
  enum ExceptionType { kNone, kEnforceNotMet, kEOF };
  ExceptionType type_{kNone};

  std::unique_ptr<std::exception> exception_;
  mutable std::mutex mu_;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle
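The new holder centralizes the catch-and-rethrow bookkeeping that threaded_ssa_graph_executor.cc previously open-coded with a mutex and a raw unique_ptr<std::exception> (see the next diff). A minimal sketch of the intended call pattern, assuming the Paddle source tree for the headers; the surrounding loop is illustrative and not part of this commit:

// Worker threads record exceptions instead of letting them escape:
try {
  op->Run();  // illustrative op execution
} catch (paddle::platform::EOFException ex) {
  exception_holder_.Catch(ex);  // stored only if nothing is held yet
} catch (paddle::platform::EnforceNotMet ex) {
  exception_holder_.Catch(ex);  // always stored; an EOF never masks it
}

// The scheduling thread polls and rethrows with the original type:
if (exception_holder_.ExceptionCatched()) {
  for (auto &f : run_op_futures_) f.wait();  // drain in-flight ops first
  exception_holder_.Throw();  // rethrows, then resets the holder to kNone
}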
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -41,7 +41,9 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
       std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
       std::unique_ptr<SSAGraphExecutor>&& underlying_executor);

-  const ir::Graph& Graph() const { return underlying_executor_->Graph(); }
+  const ir::Graph& Graph() const override {
+    return underlying_executor_->Graph();
+  }

   FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -83,7 +83,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   // Clean run context
   run_op_futures_.clear();
-  exception_.reset();
+  exception_holder_.Clear();

   // Step 3. Execution
   while (!pending_vars.empty()) {
@@ -103,23 +103,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
     if (timeout) {
-      std::unique_lock<std::mutex> l(exception_mu_);
-      if (exception_) {
-        l.unlock();
+      if (exception_holder_.ExceptionCatched()) {
         for (auto &run_op_future : run_op_futures_) {
           run_op_future.wait();
         }
-        l.lock();
-        std::exception *exp = exception_.get();
-        if (dynamic_cast<platform::EOFException *>(exp)) {
-          auto e = *static_cast<platform::EOFException *>(exp);
-          throw e;
-        } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
-          auto e = *static_cast<platform::EnforceNotMet *>(exp);
-          throw e;
-        } else {
-          LOG(FATAL) << "Unknown exception.";
-        }
+        exception_holder_.Throw();
       } else {
         continue;
       }
@@ -229,14 +217,9 @@ void ThreadedSSAGraphExecutor::RunOp(
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
     } catch (platform::EOFException ex) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      // EOFException will not cover up existing EnforceNotMet.
-      if (exception_.get() == nullptr) {
-        exception_.reset(new platform::EOFException(ex));
-      }
+      exception_holder_.Catch(ex);
     } catch (platform::EnforceNotMet ex) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      exception_.reset(new platform::EnforceNotMet(ex));
+      exception_holder_.Catch(ex);
     } catch (...) {
       LOG(FATAL) << "Unknown exception catched";
     }
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -24,6 +24,7 @@
 #include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
@@ -42,7 +43,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
                            const std::vector<platform::Place> &places,
                            std::unique_ptr<ir::Graph> &&graph);

-  const ir::Graph &Graph() const { return *graph_; }
+  const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
   // Use topological sort algorithm
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
@@ -59,8 +60,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
-  std::mutex exception_mu_;
-  std::unique_ptr<std::exception> exception_;
+  ExceptionHolder exception_holder_;
   std::atomic<int> running_ops_;

   void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
paddle/fluid/framework/ir/graph_helper_test.cc
@@ -116,8 +116,8 @@ TEST(GraphHelperTest, Basic) {
   for (size_t i = 0; i < sorted.size(); ++i) {
     node_map[sorted[i]->Name()] = i;
   }
-  ASSERT_EQ(node_map.at("op1"), 0);
-  ASSERT_EQ(node_map.at("op2"), 1);
+  ASSERT_EQ(node_map.at("op1"), 0UL);
+  ASSERT_EQ(node_map.at("op2"), 1UL);
   ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
 }
 }  // namespace ir
paddle/fluid/framework/ir/graph_test.cc
@@ -97,15 +97,15 @@ TEST(GraphTest, Basic) {
   std::vector<ir::Node *> nodes(g->Nodes().begin(), g->Nodes().end());
   for (ir::Node *n : nodes) {
     if (n->Name() == "sum") {
-      ASSERT_EQ(n->inputs.size(), 3);
-      ASSERT_EQ(n->outputs.size(), 1);
+      ASSERT_EQ(n->inputs.size(), 3UL);
+      ASSERT_EQ(n->outputs.size(), 1UL);
     } else if (n->Name() == "test_a" || n->Name() == "test_b" ||
                n->Name() == "test_c") {
-      ASSERT_EQ(n->inputs.size(), 0);
-      ASSERT_EQ(n->outputs.size(), 1);
+      ASSERT_EQ(n->inputs.size(), 0UL);
+      ASSERT_EQ(n->outputs.size(), 1UL);
     } else if (n->Name() == "test_out") {
-      ASSERT_EQ(n->inputs.size(), 1);
-      ASSERT_EQ(n->outputs.size(), 0);
+      ASSERT_EQ(n->inputs.size(), 1UL);
+      ASSERT_EQ(n->outputs.size(), 0UL);
     }
   }
   ASSERT_EQ(nodes.size(), 5);
paddle/fluid/framework/op_kernel_type_test.cc
@@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) {
   ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
             "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
             "CUDNN]");
+
+  using CUDAPlace = paddle::platform::CUDAPlace;
+  OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW,
+                               LibraryType::kCUDNN);
+  ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
+            "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
+            "type[CUDNN]");
 }

 TEST(OpKernelType, Hash) {
paddle/fluid/framework/op_proto_maker.cc
@@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
   return OpProtoAndCheckerMaker::VariableBuilder{output};
 }

+void OpProtoAndCheckerMaker::Reuse(const std::string& name,
+                                   const std::string& reused_name) {
+  bool found = false;
+  proto::OpProto::Var* var;
+
+  for (auto& var : proto_->inputs()) {
+    if (var.name() == reused_name) {
+      found = true;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found == true,
+                 "Input/Output name: %s reused_name: %s, one of them is not "
+                 "exists or not matched.",
+                 name, reused_name);
+
+  found = false;
+  for (int i = 0; i < proto_->outputs().size(); ++i) {
+    var = proto_->mutable_outputs()->Mutable(i);
+    if (var->name() == name) {
+      PADDLE_ENFORCE(!var->has_reuse(),
+                     "Output(%s) has been set reused var of %s", name,
+                     var->reuse());
+      found = true;
+      var->set_reuse(reused_name);
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found == true,
+                 "Input/Output name: %s reused_name: %s, one of them is not "
+                 "exists or not matched.",
+                 name, reused_name);
+}
+
 void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   std::unordered_set<std::string> names;
   auto checker = [&](const std::string& name) {
paddle/fluid/framework/op_proto_maker.h
@@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker {
   VariableBuilder AddOutput(const std::string &name,
                             const std::string &comment);

+  void Reuse(const std::string &name, const std::string &reused_name);
+
   template <typename T>
   TypedAttrChecker<T> &AddAttr(const std::string &name,
                                const std::string &comment,
paddle/fluid/framework/op_proto_maker_test.cc
@@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) {
 }

 class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddOutput("XOut", "output of test op").Reuse("X");
+  }
+};
+
+class TestInplaceProtoMaker2
+    : public paddle::framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
     AddInput("X", "input of test op");
@@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };

 TEST(ProtoMaker, InplaceOutput) {
-  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::proto::OpProto op_proto, op_proto2;
   paddle::framework::OpAttrChecker op_checker;
   TestInplaceProtoMaker proto_maker;
-  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+  TestInplaceProtoMaker2 proto_maker2;
+
+  proto_maker(&op_proto, &op_checker);
+
+  ASSERT_THROW(proto_maker2(&op_proto2, &op_checker),
                paddle::platform::EnforceNotMet);
-  // proto_maker(&op_proto, &op_checker);
-  // proto_maker.Make();
-  // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
+
+// normal reuse
+class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddInput("Y", "input of test op");
+    AddOutput("Out", "output of test op");
+    AddOutput("XOut", "output of test op");
+    // avoid destructor exception.
+    // Validate();
+    TestReuse();
+  }
+
+  virtual void TestReuse() {}
+};
+
+// test duplicate reuse error
+class TestReuseProtoMaker2 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() {
+    Reuse("Out", "X");
+    Reuse("Out", "Y");
+  }
+};
+
+// NotExists Input
+class TestReuseProtoMaker3 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() {
+    Reuse("Out", "NotExists");
+    Reuse("XOut", "X");
+  }
+};
+
+// NotExists Output
+class TestReuseProtoMaker4 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() { Reuse("NotExists", "X"); }
+};
+
+TEST(ProtoMaker, Reuse) {
+  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::OpAttrChecker op_checker;
+  TestReuseProtoMaker proto_maker;
+  proto_maker(&op_proto, &op_checker);
+}
+
+// NOTE(dzhwinter):
+// There is a Fatal CHECK on base class destructor, which will call abort
+// inside instead of throw an exception. If we throw an exception in Make(),
+// we will trigger the CHECK and terminate the tests.
+//
+// I had tried to replace the default CHECK with a exception, however, it's
+// still not supported by glog. the details:
+// https://github.com/google/glog/issues/249
+// https://github.com/facebookresearch/TensorComprehensions/issues/351
+/*
+TEST(ProtoMaker, ReuseWithException) {
+  paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
+  paddle::framework::OpAttrChecker op_checker;
+  TestReuseProtoMaker2 proto_maker2;
+  TestReuseProtoMaker3 proto_maker3;
+  TestReuseProtoMaker4 proto_maker4;
+  EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
+               paddle::platform::EnforceNotMet);
+  EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
+               paddle::platform::EnforceNotMet);
+  EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
+               paddle::platform::EnforceNotMet);
+}
+
+void FailureFunction() {
+  throw std::runtime_error("Check failed in destructor.");
+  // return 0;
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  google::InstallFailureFunction(&FailureFunction);
+  return RUN_ALL_TESTS();
+}
+*/
paddle/fluid/framework/operator.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }

   if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>().dims();
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return DDim({-1});
+    }
+    return tensor.dims();
   } else if (var->IsType<SelectedRows>()) {
     if (get_actual_dim) {
       return var->Get<SelectedRows>().value().dims();
@@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }
 }

+static std::string GetDtype(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  if (var == nullptr) {
+    return "";
+  }
+  if (var->IsType<LoDTensor>()) {
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return "";
+    }
+    return DataTypeToString(ToDataType(tensor.type()));
+  } else if (var->IsType<SelectedRows>()) {
+    return DataTypeToString(
+        ToDataType(var->Get<SelectedRows>().value().type()));
+  } else {
+    return "";
+  }
+}
+
 static int GetRowSize(const Scope& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
@@ -91,7 +116,11 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   }

   if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>().lod();
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return default_lod;
+    }
+    return tensor.lod();
   } else {
     return default_lod;
   }
@@ -172,6 +201,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
         if (row_size >= 0) {
           ss << "[row_size=" << row_size << "]";
         }
+        std::string dtype = GetDtype(*scope, input.second[i]);
+        ss << ":" << dtype;
         ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
paddle/fluid/framework/tensor.h
@@ -82,7 +82,7 @@ class Tensor {
   template <typename T>
   const T* data() const;

-  bool IsInitialized() const;
+  inline bool IsInitialized() const;

   /**
    * @brief Return a pointer to mutable memory block.
paddle/fluid/framework/tensor_test.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/tensor.h"
 #include <gtest/gtest.h>
 #include <string>
+#include "paddle/fluid/platform/float16.h"

 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
@@ -213,3 +214,17 @@ TEST(Tensor, Layout) {
   src.set_layout(framework::DataLayout::kAnyLayout);
   ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
 }
+
+TEST(Tensor, FP16) {
+  using platform::float16;
+  framework::Tensor src;
+  float16* src_ptr = src.mutable_data<float16>({2, 3}, platform::CPUPlace());
+  for (int i = 0; i < 2 * 3; ++i) {
+    src_ptr[i] = static_cast<float16>(i);
+  }
+  EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16));
+  // EXPECT a human readable error message
+  // src.data<uint8_t>();
+  // Tensor holds the wrong type, it holds N6paddle8platform7float16E at
+  // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43]
+}
paddle/fluid/inference/analysis/CMakeLists.txt
@@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
             tensorrt_subgraph_node_mark_pass.cc
             analyzer.cc
             helper.cc
+            model_store_pass.cc
             DEPS framework_proto proto_desc)
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
+cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)

 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
@@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
 inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
+inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
paddle/fluid/inference/analysis/analyzer.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
@@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
               "Graphviz debuger for data flow graphs.");

+DEFINE_string(inference_analysis_output_storage_path, "",
+              "optimized model output path");
+
 namespace inference {
 namespace analysis {
@@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager {
       AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
     }
     AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
+    if (!FLAGS_inference_analysis_output_storage_path.empty()) {
+      AddPass("model-store-pass", new ModelStorePass);
+    }
   }

   std::string repr() const override { return "dfg-pass-manager"; }
paddle/fluid/inference/analysis/analyzer.h
@@ -16,28 +16,23 @@ limitations under the License. */
 /*
  * This file contains Analyzer, an class that exposed as a library that analyze
- * and optimize
- * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to
- * control whether
- * an process is applied on the program.
+ * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has
+ * multiple flags to
+ * control whether an process is applied on the program.
  *
  * The processes are called Passes in analysis, the Passes are placed in a
- * pipeline, the first
- * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to
- * a data flow
- * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow
- * graph to a
- * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes
- * which take a
- * node or data flow graph as input.
+ * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a
+ * Fluid ProgramDesc to
+ * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms
+ * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the
+ * pipeline can be any Passes
+ * which take a node or data flow graph as input.
  *
  * The Analyzer can be used in two methods, the first is a executable file which
- * can be used to
- * pre-process the inference model and can be controlled by passing difference
- * command flags;
+ * can be used to pre-process the inference model and can be controlled by
+ * passing difference command flags;
  * the other way is to compose inside the inference API as a runtime pre-process
- * phase in the
- * inference service.
+ * phase in the inference service.
  */
@@ -50,6 +45,7 @@ namespace paddle {
 // flag if not available.
 DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
 DECLARE_string(inference_analysis_graphviz_log_root);
+DECLARE_string(inference_analysis_output_storage_path);

 namespace inference {
 namespace analysis {
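For the second usage mode described in the comment above (composing the Analyzer inside the inference API), the call sequence is short. A minimal sketch assuming the Paddle source tree for the header; the model directory is a placeholder (compare the updated analyzer_tester.cc further down):

#include "paddle/fluid/inference/analysis/analyzer.h"

// Programmatic pre-processing of a trained model.
void AnalyzeModel() {
  using paddle::inference::analysis::Analyzer;
  using paddle::inference::analysis::Argument;

  Argument argument;
  // Placeholder path; points at a trained Fluid model directory.
  argument.fluid_model_dir.reset(new std::string("/path/to/model"));

  Analyzer analyzer;
  analyzer.Run(&argument);  // runs the registered Pass pipeline in order
}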
paddle/fluid/inference/analysis/analyzer_main.cc (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/*
 * This file implements analysizer -- an executation help to analyze and
 * optimize trained model.
 */
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gflags/gflags.h>
#include <glog/logging.h>

int main(int argc, char** argv) {
  google::ParseCommandLineFlags(&argc, &argv, true);
  using paddle::inference::analysis::Analyzer;
  using paddle::inference::analysis::Argument;

  Argument argument;
  Analyzer analyzer;
  analyzer.Run(&argument);

  return 0;
}
paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -20,14 +20,18 @@ namespace paddle {
 namespace inference {
 namespace analysis {

-TEST_F(DFG_Tester, analysis_without_tensorrt) {
+TEST(Analyzer, analysis_without_tensorrt) {
   FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }

-TEST_F(DFG_Tester, analysis_with_tensorrt) {
+TEST(Analyzer, analysis_with_tensorrt) {
   FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
+  Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }
paddle/fluid/inference/analysis/argument.h
@@ -23,6 +23,7 @@
 #pragma once

+#include <string>
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
@@ -36,6 +37,16 @@ namespace analysis {
  * All the fields should be registered here for clearness.
  */
 struct Argument {
+  Argument() = default;
+  explicit Argument(const std::string& fluid_model_dir)
+      : fluid_model_dir(new std::string(fluid_model_dir)) {}
+  // The directory of the trained model.
+  std::unique_ptr<std::string> fluid_model_dir;
+  // The path of `__model__` and `param`, this is used when the file name of
+  // model and param is changed.
+  std::unique_ptr<std::string> fluid_model_program_path;
+  std::unique_ptr<std::string> fluid_model_param_path;
+
   // The graph that process by the Passes or PassManagers.
   std::unique_ptr<DataFlowGraph> main_dfg;
@@ -44,6 +55,9 @@ struct Argument {
   // The processed program desc.
   std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
+
+  // The output storage path of ModelStorePass.
+  std::unique_ptr<std::string> model_output_store_path;
 };

 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
paddle/fluid/inference/analysis/data_flow_graph.h
@@ -36,6 +36,8 @@ namespace analysis {
 /*
  * DataFlowGraph - A container of Value and Function Nodes.
+ *
+ * This is the base graph for any other type of graphs, such as SSA or CFG.
  */
 struct DataFlowGraph {
   NodeMap nodes;
@@ -174,7 +176,7 @@ struct GraphTraits<DataFlowGraph> {
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
 std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);  // NOLINT

 }  // namespace analysis
 }  // namespace inference
paddle/fluid/inference/analysis/data_flow_graph_tester.cc
View file @ a58dd3e5

@@ -20,7 +20,7 @@ namespace inference {
 namespace analysis {

 TEST(DataFlowGraph, BFS) {
-  auto desc = LoadProgramDesc();
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
@@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) {
 }

 TEST(DataFlowGraph, DFS) {
-  auto desc = LoadProgramDesc();
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
   GraphTraits<DataFlowGraph> trait(&dfg);
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
View file @ a58dd3e5

@@ -26,21 +26,21 @@ namespace paddle {
 namespace inference {
 namespace analysis {

-TEST_F(DFG_Tester, Test) {
-  DataFlowGraph graph;
+TEST(DataFlowGraph, Test) {
+  Argument argument(FLAGS_inference_model_dir);
   FluidToDataFlowGraphPass pass0;
   DataFlowGraphToFluidPass pass1;
   ASSERT_TRUE(pass0.Initialize(&argument));
   ASSERT_TRUE(pass1.Initialize(&argument));
-  pass0.Run(&graph);
-  pass1.Run(&graph);
+  pass0.Run(argument.main_dfg.get());
+  pass1.Run(argument.main_dfg.get());
   pass0.Finalize();
   pass1.Finalize();
-  LOG(INFO) << graph.nodes.size();
+  LOG(INFO) << argument.main_dfg->nodes.size();
 }

 };  // namespace analysis
paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
View file @ a58dd3e5

@@ -23,12 +23,18 @@ namespace paddle {
 namespace inference {
 namespace analysis {

-TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
-  auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) {
+  Argument argument(FLAGS_inference_model_dir);
+  FluidToDataFlowGraphPass pass0;
+  ASSERT_TRUE(pass0.Initialize(&argument));
+  pass0.Run(argument.main_dfg.get());
+  // auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
   DFG_GraphvizDrawPass::Config config("./", "test");
   DFG_GraphvizDrawPass pass(config);
   pass.Initialize(&argument);
-  pass.Run(&dfg);
+  pass.Run(argument.main_dfg.get());
   // test content
   std::ifstream file("./0-graph_test.dot");
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
View file @ a58dd3e5

@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <glog/logging.h>
 #include <string>
 #include <vector>
@@ -25,8 +26,20 @@ namespace analysis {

 bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
-  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
-  PADDLE_ENFORCE(argument);
+  if (argument->origin_program_desc) {
+    LOG(WARNING) << "argument's origin_program_desc is already set, might "
+                    "duplicate called";
+  }
+  if (!argument->fluid_model_program_path) {
+    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
+    argument->fluid_model_program_path.reset(
+        new std::string(*argument->fluid_model_dir + "/__model__"));
+  }
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
+  auto program = LoadProgramDesc(*argument->fluid_model_program_path);
+  argument->origin_program_desc.reset(
+      new framework::proto::ProgramDesc(program));
   if (!argument->main_dfg) {
     argument->main_dfg.reset(new DataFlowGraph);
   }
@@ -40,6 +53,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
   PADDLE_ENFORCE(graph);
   PADDLE_ENFORCE(desc_);
   // insert vars
+  // `var2id` maps a variable's name to its Node-id; the Node-id keeps being
+  // updated to the variable's latest alias while the graph is built.
   std::unordered_map<std::string, size_t> var2id;
   auto &main_block = desc_->blocks(framework::kRootBlockIndex);
   for (int i = 0; i < main_block.vars_size(); i++) {
@@ -51,6 +66,15 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     var2id[var.name()] = v->id();
   }

+  // A variable in SSA form can only be written once, so if a variable is
+  // written multiple times (quite common in our ProgramDesc design), multiple
+  // alias Nodes of this variable are created, each written exactly once.
+  // This set keeps all the variables (the originals, not the aliases) that
+  // have already been written (as outputs). Once an Op's output variable hits
+  // the set, a new alias is created and the global alias for this variable is
+  // updated. That is what makes the Data Flow Graph an SSA form.
+  std::unordered_set<Node *> unique_written_vars;
   for (int i = 0; i < main_block.ops_size(); i++) {
     const auto &op = main_block.ops(i);
     auto *o = graph->nodes.Create(Node::Type::kFunction);
@@ -62,33 +86,33 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     o->SetPbMsg(op.SerializeAsString());
     // set inputs and outputs
-    std::unordered_set<Node *> inlinks;
     for (int j = 0; j < op.inputs_size(); j++) {
       auto &in_var = op.inputs(j);
       for (int k = 0; k < in_var.arguments_size(); k++) {
         auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
         in->outlinks.push_back(o);
         o->inlinks.push_back(in);
-        inlinks.insert(in);
       }
     }
     for (int j = 0; j < op.outputs_size(); j++) {
       auto &out_var = op.outputs(j);
       for (int k = 0; k < out_var.arguments_size(); k++) {
         auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
-        if (inlinks.count(out)) {
+        if (unique_written_vars.count(out)) {
           // Loop found, for example, a = op(a); use SSA, change to a1 = op(a).
           auto *out_alias = graph->nodes.Create(Node::Type::kValue);
           out_alias->SetName(out->name());
           out_alias->SetPbDesc(out->pb_desc());
           out_alias->SetPbMsg(out->pb_msg());
-          var2id[out_alias->name()] = out_alias->id();  // update a -> a0
+          var2id[out_alias->name()] =
+              out_alias->id();  // update the variable's alias Node
           LOG(INFO) << "loop found in graph, create SSA alias node ["
                     << out_alias->repr() << "] for [" << out->repr() << "]";
           out = out_alias;
         }
         out->inlinks.push_back(o);
         o->outlinks.push_back(out);
+        unique_written_vars.insert(out);
       }
     }
   }
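The SSA bookkeeping above is easy to see in isolation. A self-contained sketch of the same renaming rule, using plain standard containers rather than the real Node types:

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <unordered_set>

    int main() {
      // var2id: variable name -> id of its latest alias node.
      std::unordered_map<std::string, int> var2id{{"a", 0}, {"b", 1}};
      std::unordered_set<int> written;  // node ids already written once
      int next_id = 2;

      // Simulate two ops that both write `a`, e.g. a = op0(b); a = op1(a);
      for (int step = 0; step < 2; ++step) {
        int out = var2id["a"];
        if (written.count(out)) {
          // `a` was already written: create an alias a1 and redirect var2id,
          // exactly like the pass creates an alias kValue node.
          int alias = next_id++;
          var2id["a"] = alias;
          std::cout << "alias node " << alias << " created for a\n";
          out = alias;
        }
        written.insert(out);
      }
      // var2id["a"] now points at the newest alias; every id is written once.
    }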
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
View file @ a58dd3e5

@@ -30,7 +30,7 @@ namespace inference {
 namespace analysis {

 /*
- * Transform a FluidDesc to a data flow graph.
+ * Transform a FluidDesc to a SSA.
  */
 class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
  public:
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
View file @ a58dd3e5

@@ -21,8 +21,9 @@ namespace paddle {
 namespace inference {
 namespace analysis {

-TEST_F(DFG_Tester, Init) {
+TEST(FluidToDataFlowGraphPass, Test) {
   FluidToDataFlowGraphPass pass;
+  Argument argument(FLAGS_inference_model_dir);
   pass.Initialize(&argument);
   pass.Run(argument.main_dfg.get());
   // Analysis is sensitive to ProgramDesc, careful to change the original model.
paddle/fluid/inference/analysis/helper.h
View file @ a58dd3e5

@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <cstdio>
+#include <fstream>
 #include <string>
 #include <typeindex>
 #include <unordered_map>
@@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) {
   }
 }

+static framework::proto::ProgramDesc LoadProgramDesc(
+    const std::string &model_path) {
+  std::ifstream fin(model_path, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path);
+  fin.seekg(0, std::ios::end);
+  std::string buffer(fin.tellg(), ' ');
+  fin.seekg(0, std::ios::beg);
+  fin.read(&buffer[0], buffer.size());
+  fin.close();
+  framework::proto::ProgramDesc program_desc;
+  program_desc.ParseFromString(buffer);
+  return program_desc;
+}

 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
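With LoadProgramDesc now living in helper.h, any test or pass can load a serialized program in one call. A small usage sketch (the proto accessors come from framework.proto, as used by the pass above):

    #include <glog/logging.h>
    #include <string>
    #include "paddle/fluid/inference/analysis/helper.h"

    void InspectModel(const std::string &model_dir) {
      auto program =
          paddle::inference::analysis::LoadProgramDesc(model_dir + "/__model__");
      LOG(INFO) << "blocks: " << program.blocks_size();
      LOG(INFO) << "ops in root block: " << program.blocks(0).ops_size();
    }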
paddle/fluid/inference/analysis/model_store_pass.cc
0 → 100644
View file @ a58dd3e5

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <stdlib.h>
#include <string>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/model_store_pass.h"

namespace paddle {
namespace inference {
namespace analysis {

void ModelStorePass::Run(DataFlowGraph *x) {
  if (!argument_->fluid_model_param_path) {
    PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir);
    argument_->fluid_model_param_path.reset(
        new std::string(*argument_->fluid_model_dir + "param"));
  }
  PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path);
  // Directly copy param file to destination.
  std::stringstream ss;
  // NOTE these commands only work on Linux.
  ss << "mkdir -p " << *argument_->model_output_store_path;
  LOG(INFO) << "run command: " << ss.str();
  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
  ss.str("");

  ss << "cp " << *argument_->fluid_model_dir << "/*"
     << " " << *argument_->model_output_store_path;
  LOG(INFO) << "run command: " << ss.str();
  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);

  // Store program
  PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
                          "program desc is not transformed, should call "
                          "DataFlowGraphToFluidPass first.");
  const std::string program_output_path =
      *argument_->model_output_store_path + "/__model__";
  std::ofstream file(program_output_path, std::ios::binary);
  PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.",
                 program_output_path);
  const std::string serialized_message =
      argument_->transformed_program_desc->SerializeAsString();
  file.write(serialized_message.c_str(), serialized_message.size());
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
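The NOTE in the pass is worth heeding: shelling out to `mkdir -p` and `cp` ties ModelStorePass to POSIX systems. A hedged sketch of a portable alternative using C++17 std::filesystem, which is not what the pass does today:

    #include <filesystem>
    #include <string>

    namespace fs = std::filesystem;

    // Copy every file of a model directory into an output directory.
    void CopyModelDir(const std::string &src, const std::string &dst) {
      fs::create_directories(dst);  // behaves like `mkdir -p`
      for (const auto &entry : fs::directory_iterator(src)) {
        fs::copy(entry.path(), fs::path(dst) / entry.path().filename(),
                 fs::copy_options::overwrite_existing);
      }
    }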
paddle/fluid/inference/analysis/model_store_pass.h
0 → 100644
View file @ a58dd3e5

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/*
 * This file defines ModelStorePass, which stores the runtime DFG as a Paddle
 * model on disk, so that the model can be reloaded for prediction.
 */

#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/pass.h"

namespace paddle {
namespace inference {
namespace analysis {

class ModelStorePass : public DataFlowGraphPass {
 public:
  bool Initialize(Argument* argument) override {
    if (!argument) {
      LOG(ERROR) << "invalid argument";
      return false;
    }
    argument_ = argument;
    return true;
  }

  void Run(DataFlowGraph* x) override;

  std::string repr() const override { return "DFG-store-pass"; }
  std::string description() const override {
    return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle
model in the disk, and that model can be reloaded for prediction again.)DD";
  }

 private:
  Argument* argument_{nullptr};
};

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/analysis/model_store_pass_tester.cc
0 → 100644
View file @ a58dd3e5

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"

namespace paddle {
namespace inference {
namespace analysis {

DEFINE_string(inference_model_dir, "", "Model path");

TEST(DFG_StorePass, test) {
  Analyzer analyzer;
  Argument argument(FLAGS_inference_model_dir);
  argument.model_output_store_path.reset(
      new std::string("./_dfg_store_pass_tmp"));
  // disable storage in the analyzer
  FLAGS_inference_analysis_output_storage_path = "";
  analyzer.Run(&argument);

  ModelStorePass pass;
  pass.Initialize(&argument);
  pass.Run(argument.main_dfg.get());
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/analysis/pass.h
View file @ a58dd3e5

@@ -50,6 +50,7 @@ class Pass {
   // Create a debugger Pass that draw the DFG by graphviz toolkit.
   virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }

+  virtual void Run() { LOG(FATAL) << "not valid"; }
   // Run on a single Node.
   virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
   // Run on a single Function.
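The extra no-argument Run() keeps the Pass interface uniform: a subclass overrides only the granularity it supports, and any other overload still trips the base LOG(FATAL) guard. A hypothetical graph-level pass, for illustration only:

    // Hypothetical pass; only the DataFlowGraph overload is implemented, the
    // Node/Function overloads keep the "not valid" default from the base.
    class CountNodesPass : public DataFlowGraphPass {
     public:
      bool Initialize(Argument *argument) override { return argument != nullptr; }
      void Run(DataFlowGraph *graph) override {
        LOG(INFO) << "graph has " << graph->nodes.size() << " nodes";
      }
      std::string repr() const override { return "count-nodes-pass"; }
      std::string description() const override { return "counts graph nodes"; }
    };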
paddle/fluid/inference/analysis/pass_manager_tester.cc
View file @ a58dd3e5

@@ -56,7 +56,7 @@ class TestNodePass final : public NodePass {
   std::string description() const override { return "some doc"; }
 };

-TEST_F(DFG_Tester, DFG_pass_manager) {
+TEST(PassManager, DFG_pass_manager) {
   TestDfgPassManager manager;
   DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
@@ -64,12 +64,15 @@ TEST(PassManager, DFG_pass_manager) {
   manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
   manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);

+  Argument argument(FLAGS_inference_model_dir);
   ASSERT_TRUE(&argument);
   ASSERT_TRUE(manager.Initialize(&argument));
   manager.RunAll();
 }

-TEST_F(DFG_Tester, Node_pass_manager) {
+TEST(PassManager, Node_pass_manager) {
+  Argument argument(FLAGS_inference_model_dir);
   // Pre-process: initialize the DFG with the ProgramDesc first.
   FluidToDataFlowGraphPass pass0;
   pass0.Initialize(&argument);
paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
View file @ a58dd3e5

@@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
   return false;
 };

-TEST_F(DFG_Tester, Split) {
-  auto desc = LoadProgramDesc();
+TEST(SubGraphSplitter, Split) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   LOG(INFO) << "spliter\n" << dfg.DotString();
@@ -63,8 +63,8 @@ TEST(SubGraphSplitter, Split) {
   ASSERT_EQ(subgraphs.back().size(), 6UL);
 }

-TEST_F(DFG_Tester, Fuse) {
-  auto desc = LoadProgramDesc();
+TEST(SubGraphSplitter, Fuse) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   size_t count0 = dfg.nodes.size();
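Both tests drive SubGraphSplitter through a teller, the predicate that decides which nodes may be fused into a sub-graph. A hedged sketch of a name-based teller in the same lambda shape used by these testers (that node->name() yields the op type here is an assumption for illustration):

    #include <string>
    #include <unordered_set>

    // Accept only function (op) nodes whose type name is in a fixed set.
    std::unordered_set<std::string> supported(
        {"elementwise_add", "mul", "sigmoid"});
    SubGraphSplitter::NodeInsideSubgraphTeller teller =
        [&](const Node *node) -> bool {
      if (node->type() != Node::Type::kFunction) return false;
      return supported.count(node->name()) > 0;  // assumption: name() is op type
    };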
paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
View file @ a58dd3e5

@@ -22,11 +22,11 @@ namespace paddle {
 namespace inference {
 namespace analysis {

-TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
+TEST(TensorRTSubgraphNodeMarkPass, test) {
   // init
   FluidToDataFlowGraphPass pass;
+  Argument argument(FLAGS_inference_model_dir);
   ASSERT_TRUE(pass.Initialize(&argument));
-  argument.main_dfg.reset(new DataFlowGraph);
   pass.Run(argument.main_dfg.get());

   TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
@@ -41,7 +41,7 @@ TEST(TensorRTSubgraphNodeMarkPass, test) {
   for (auto& node : argument.main_dfg->nodes.nodes()) {
     counter += node->attr(ATTR_supported_by_tensorrt).Bool();
   }
+  ASSERT_EQ(counter, 2);
   LOG(INFO) << counter << " nodes marked";
 }
paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
View file @ a58dd3e5

@@ -25,7 +25,7 @@ namespace analysis {

 DEFINE_string(dot_dir, "./", "");

-TEST_F(DFG_Tester, tensorrt_single_pass) {
+TEST(TensorRTSubGraphPass, main) {
   std::unordered_set<std::string> teller_set(
       {"elementwise_add", "mul", "sigmoid"});
   SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
@@ -35,7 +35,8 @@ TEST(TensorRTSubGraphPass, main) {
     return false;
   };

-  LOG(INFO) << "init";
+  Argument argument(FLAGS_inference_model_dir);
   DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
   DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
@@ -44,13 +45,11 @@ TEST(TensorRTSubGraphPass, main) {
   FluidToDataFlowGraphPass pass0;
   TensorRTSubGraphPass trt_pass(std::move(teller));

-  LOG(INFO) << "Initialize";
   dfg_pass.Initialize(&argument);
   dfg_pass1.Initialize(&argument);
   pass0.Initialize(&argument);
   trt_pass.Initialize(&argument);

-  LOG(INFO) << "Run";
   argument.main_dfg.reset(new DataFlowGraph);
   pass0.Run(argument.main_dfg.get());
   dfg_pass.Run(argument.main_dfg.get());
paddle/fluid/inference/analysis/ut_helper.h
View file @ a58dd3e5

@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/analysis/helper.h"

 namespace paddle {
 namespace inference {
@@ -32,27 +32,12 @@ namespace analysis {

 DEFINE_string(inference_model_dir, "", "inference test model dir");

-static framework::proto::ProgramDesc LoadProgramDesc(
-    const std::string &model_dir = FLAGS_inference_model_dir) {
-  std::string msg;
-  std::string net_file = FLAGS_inference_model_dir + "/__model__";
-  std::ifstream fin(net_file, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", net_file);
-  fin.seekg(0, std::ios::end);
-  msg.resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(msg.at(0)), msg.size());
-  fin.close();
-  framework::proto::ProgramDesc program_desc;
-  program_desc.ParseFromString(msg);
-  return program_desc;
-}
-
 static DataFlowGraph ProgramDescToDFG(
     const framework::proto::ProgramDesc &desc) {
   DataFlowGraph graph;
   FluidToDataFlowGraphPass pass;
   Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
   pass.Initialize(&argument);
   pass.Run(&graph);
@@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG(
 class DFG_Tester : public ::testing::Test {
  protected:
   void SetUp() override {
-    auto desc = LoadProgramDesc(FLAGS_inference_model_dir);
+    auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
     argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
   }
paddle/fluid/inference/api/CMakeLists.txt
View file @ a58dd3e5

@@ -19,6 +19,7 @@ endif(APPLE)

 set(inference_deps paddle_inference_api paddle_fluid_api)

 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
 endif()
@@ -63,6 +64,8 @@ endif()
 if (WITH_ANAKIN) # only needed in CI
   # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
   # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
-  # compile the libinference_anakin_api.a and compile with anakin.so.
+  # compile the libinference_anakin_api.a and anakin.so.
+  fetch_include_recursively(${ANAKIN_INCLUDE})
   nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
   nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
@@ -73,7 +76,7 @@ if (WITH_ANAKIN) # only needed in CI
   if (WITH_TESTING)
     cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
             ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
-            DEPS inference_anakin_api)
+            DEPS inference_anakin_api_shared)
     target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
   endif(WITH_TESTING)
 endif()
paddle/fluid/inference/api/api_anakin_engine.cc
View file @ a58dd3e5

@@ -18,26 +18,36 @@
 namespace paddle {

-PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
+template <typename Target>
+PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
     const AnakinConfig &config) {
   CHECK(Init(config));
 }

-bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
+template <typename Target>
+bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) {
   if (!(graph_.load(config.model_file))) {
+    LOG(FATAL) << "fail to load graph from " << config.model_file;
     return false;
   }
-  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  auto inputs = graph_.get_ins();
+  for (auto &input_str : inputs) {
+    graph_.ResetBatchSize(input_str, config.max_batch_size);
+  }
   // optimization for graph
   if (!(graph_.Optimize())) {
     return false;
   }
   // construct executer
-  executor_.init(graph_);
+  if (executor_p_ == nullptr) {
+    executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
+                                  anakin::Precision::FP32>(graph_, true);
+  }
   return true;
 }

-bool PaddleInferenceAnakinPredictor::Run(
+template <typename Target>
+bool PaddleInferenceAnakinPredictor<Target>::Run(
     const std::vector<PaddleTensor> &inputs,
     std::vector<PaddleTensor> *output_data, int batch_size) {
   for (const auto &input : inputs) {
@@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run(
                  << "'s type is not float";
       return false;
     }
-    auto d_tensor_in_p = executor_.get_in(input.name);
+    auto d_tensor_in_p = executor_p_->get_in(input.name);
+    auto net_shape = d_tensor_in_p->valid_shape();
+    if (net_shape.size() != input.shape.size()) {
+      LOG(ERROR) << " input  " << input.name
+                 << "'s shape size should be equal to that of net";
+      return false;
+    }
+    int sum = 1;
+    for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; });
+    if (sum > net_shape.count()) {
+      graph_.Reshape(input.name, input.shape);
+      delete executor_p_;
+      executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
+                                    anakin::Precision::FP32>(graph_, true);
+      d_tensor_in_p = executor_p_->get_in(input.name);
+    }
+    anakin::saber::Shape tmp_shape;
+    for (auto s : input.shape) {
+      tmp_shape.push_back(s);
+    }
+    d_tensor_in_p->reshape(tmp_shape);
     float *d_data_p = d_tensor_in_p->mutable_data();
     if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
                    d_tensor_in_p->valid_size() * sizeof(float),
@@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run(
     }
     cudaStreamSynchronize(NULL);
   }
-  cudaDeviceSynchronize();
-  executor_.prediction();
+  executor_p_->prediction();
+  cudaDeviceSynchronize();
   if (output_data->empty()) {
     LOG(ERROR) << "At least one output should be set with tensors' names.";
     return false;
   }
   for (auto &output : *output_data) {
-    auto *tensor = executor_.get_out(output.name);
-    output.shape = tensor->shape();
+    auto *tensor = executor_p_->get_out(output.name);
+    output.shape = tensor->valid_shape();
     if (output.data.length() < tensor->valid_size() * sizeof(float)) {
       output.data.Resize(tensor->valid_size() * sizeof(float));
     }
@@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run(
   return true;
 }

-anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
-    &PaddleInferenceAnakinPredictor::get_executer() {
-  return executor_;
+template <typename Target>
+anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor<Target>::get_executer() {
+  return *executor_p_;
 }

 // the cloned new Predictor of anakin share the same net weights from original
 // Predictor
-std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
+template <typename Target>
+std::unique_ptr<PaddlePredictor>
+PaddleInferenceAnakinPredictor<Target>::Clone() {
   VLOG(3) << "Anakin Predictor::clone";
-  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  std::unique_ptr<PaddlePredictor> cls(
+      new PaddleInferenceAnakinPredictor<Target>());
   // construct executer from other graph
   auto anakin_predictor_p =
-      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+      dynamic_cast<PaddleInferenceAnakinPredictor<Target> *>(cls.get());
   if (!anakin_predictor_p) {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
@@ -103,14 +140,28 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
   return std::move(cls);
 }

+template class PaddleInferenceAnakinPredictor<anakin::NV>;
+template class PaddleInferenceAnakinPredictor<anakin::X86>;

 // A factory to help create difference predictor.
 template <>
 std::unique_ptr<PaddlePredictor>
 CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
     const AnakinConfig &config) {
   VLOG(3) << "Anakin Predictor create.";
-  std::unique_ptr<PaddlePredictor> x(
-      new PaddleInferenceAnakinPredictor(config));
-  return x;
+  if (config.target_type == AnakinConfig::NVGPU) {
+    VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
+    std::unique_ptr<PaddlePredictor> x(
+        new PaddleInferenceAnakinPredictor<anakin::NV>(config));
+    return x;
+  } else if (config.target_type == AnakinConfig::X86) {
+    VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
+    std::unique_ptr<PaddlePredictor> x(
+        new PaddleInferenceAnakinPredictor<anakin::X86>(config));
+    return x;
+  } else {
+    VLOG(3) << "Anakin Predictor create on unknown platform.";
+    return nullptr;
+  }
+};

 }  // namespace paddle
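With the predictor now templated on Target, the factory dispatches on config.target_type, so target selection reduces to one field on the caller's side. A minimal sketch mirroring GetConfig() in the tester below:

    #include <memory>
    #include <string>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    std::unique_ptr<paddle::PaddlePredictor> MakeAnakinPredictor(
        const std::string &model_file, bool use_gpu) {
      paddle::AnakinConfig config;
      config.target_type =
          use_gpu ? paddle::AnakinConfig::NVGPU : paddle::AnakinConfig::X86;
      config.model_file = model_file;
      config.device = 0;
      config.max_batch_size = 1;
      return paddle::CreatePaddlePredictor<paddle::AnakinConfig,
                                           paddle::PaddleEngineKind::kAnakin>(
          config);
    }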
paddle/fluid/inference/api/api_anakin_engine.h
View file @ a58dd3e5

@@ -20,14 +20,16 @@ limitations under the License. */
 #pragma once

 #include <vector>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
+// from anakin
 #include "framework/core/net/net.h"
+#include "framework/graph/graph.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "saber/core/shape.h"
 #include "saber/saber_types.h"

 namespace paddle {

+template <typename Target>
 class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  public:
   PaddleInferenceAnakinPredictor() {}
@@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
   std::unique_ptr<PaddlePredictor> Clone() override;

-  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
   get_executer();

-  ~PaddleInferenceAnakinPredictor() override {};
+  ~PaddleInferenceAnakinPredictor() override {
+    delete executor_p_;
+    executor_p_ = nullptr;
+  };

  private:
   bool Init(const AnakinConfig& config);

-  anakin::graph::Graph<anakin::NV, anakin::saber::AK_FLOAT,
+  anakin::graph::Graph<Target, anakin::saber::AK_FLOAT,
                        anakin::Precision::FP32>
       graph_;
-  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
-      executor_;
+  anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
      executor_p_{nullptr};
   AnakinConfig config_;
 };
paddle/fluid/inference/api/api_anakin_engine_tester.cc
View file @ a58dd3e5

@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"

-DEFINE_string(model, "", "Directory of the inference model.");
+DEFINE_string(model, "", "Directory of the inference model(mobile_v2).");

 namespace paddle {

 AnakinConfig GetConfig() {
   AnakinConfig config;
+  // using AnakinConfig::X86 if you need to use cpu to do inference
+  config.target_type = AnakinConfig::NVGPU;
   config.model_file = FLAGS_model;
   config.device = 0;
   config.max_batch_size = 1;
@@ -36,28 +38,27 @@ TEST(inference, anakin) {
       CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);

   float data[1 * 3 * 224 * 224] = {1.0f};
-  PaddleTensor tensor{.name = "input_0",
-                      .shape = std::vector<int>({1, 3, 224, 224}),
-                      .data = PaddleBuf(data, sizeof(data)),
-                      .dtype = PaddleDType::FLOAT32};
+  PaddleTensor tensor;
+  tensor.name = "input_0";
+  tensor.shape = std::vector<int>({1, 3, 224, 224});
+  tensor.data = PaddleBuf(data, sizeof(data));
+  tensor.dtype = PaddleDType::FLOAT32;

   // For simplicity, we set all the slots with the same data.
-  std::vector<PaddleTensor> paddle_tensor_feeds;
-  paddle_tensor_feeds.emplace_back(std::move(tensor));
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);

-  PaddleTensor tensor_out{.name = "prob_out",
-                          .shape = std::vector<int>({1000, 1}),
-                          .data = PaddleBuf(),
-                          .dtype = PaddleDType::FLOAT32};
+  PaddleTensor tensor_out;
+  tensor_out.name = "prob_out";
+  tensor_out.shape = std::vector<int>({});
+  tensor_out.data = PaddleBuf();
+  tensor_out.dtype = PaddleDType::FLOAT32;

-  std::vector<PaddleTensor> outputs;
-  outputs.emplace_back(std::move(tensor_out));
+  std::vector<PaddleTensor> outputs(1, tensor_out);

   ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));

   float* data_o = static_cast<float*>(outputs[0].data.data());
-  for (size_t j = 0; j < 1000; ++j) {
+  for (size_t j = 0; j < outputs[0].data.length(); ++j) {
     LOG(INFO) << "output[" << j << "]: " << data_o[j];
   }
 }
paddle/fluid/inference/api/api_impl.cc
View file @ a58dd3e5

@@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
     std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                 inputs[i].data.length());
+    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
+    framework::LoD lod;
+    for (auto &level : inputs[i].lod) {
+      lod.emplace_back(level);
+    }
+    input.set_lod(lod);
+
     feeds->push_back(input);
   }
   return true;
@@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch(
       buffer.Resize(sizeof(float) * data.size());
     }
     std::memcpy(buffer.data(), data.data(), buffer.length());
+    // copy LoD
+    for (const auto &level : fetchs[i].lod()) {
+      outputs->at(i).lod.emplace_back(level);
+    }
     outputs->at(i).dtype = PaddleDType::FLOAT32;
     // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
   }
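With LoD copied both ways, sequence inputs can now be expressed through the public API. A sketch of a two-sequence batch; the offsets follow the usual Paddle LoD convention, here marking sequences [0,3) and [3,4):

    #include <vector>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    void BuildLoDInput() {
      static int64_t words[4] = {1, 2, 3, 4};  // 4 steps, two sequences
      paddle::PaddleTensor tensor;
      tensor.shape = std::vector<int>({4, 1});
      tensor.data = paddle::PaddleBuf(words, sizeof(words));
      tensor.dtype = paddle::PaddleDType::INT64;
      tensor.lod = {{0, 3, 4}};  // level-0 offsets: seq0 = [0,3), seq1 = [3,4)
    }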
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
View file @ a58dd3e5

@@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
   void OptimizeInferenceProgram() {
     // Analyze inference_program
     Argument argument;
+    if (!config_.model_dir.empty()) {
+      argument.fluid_model_dir.reset(new std::string(config_.model_dir));
+    } else {
+      PADDLE_ENFORCE(
+          !config_.param_file.empty(),
+          "Either model_dir or (param_file, prog_file) should be set.");
+      PADDLE_ENFORCE(!config_.prog_file.empty());
+      argument.fluid_model_program_path.reset(
+          new std::string(config_.prog_file));
+      argument.fluid_model_param_path.reset(
+          new std::string(config_.param_file));
+    }
     argument.origin_program_desc.reset(
         new ProgramDesc(*inference_program_->Proto()));
     Singleton<Analyzer>::Global().Run(&argument);
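On the caller's side of OptimizeInferenceProgram, either way of locating the model now works. A short sketch with hypothetical paths:

    paddle::TensorRTConfig config;
    config.model_dir = "./word2vec_model";  // option 1: the whole directory
    // Option 2: explicit files, when `__model__` / `param` were renamed:
    // config.prog_file  = "./word2vec_model/__model__";
    // config.param_file = "./word2vec_model/param";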
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
View file @ a58dd3e5

@@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
   std::vector<int64_t> data(20);
   for (int i = 0; i < 20; i++) data[i] = i;

-  PaddleTensor tensor{
-      .name = "",
-      .shape = std::vector<int>({10, 1}),
-      .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
-      .dtype = PaddleDType::INT64};
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({10, 1});
+  tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t));
+  tensor.dtype = PaddleDType::INT64;

   // For simplicity, we set all the slots with the same data.
   std::vector<PaddleTensor> slots(4, tensor);
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
View file @ a58dd3e5

@@ -47,10 +47,10 @@ void Main(bool use_gpu) {
     //# 2. Prepare input.
     int64_t data[4] = {1, 2, 3, 4};
-    PaddleTensor tensor{.name = "",
-                        .shape = std::vector<int>({4, 1}),
-                        .data = PaddleBuf(data, sizeof(data)),
-                        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor;
+    tensor.shape = std::vector<int>({4, 1});
+    tensor.data = PaddleBuf(data, sizeof(data));
+    tensor.dtype = PaddleDType::INT64;

     // For simplicity, we set all the slots with the same data.
     std::vector<PaddleTensor> slots(4, tensor);
@@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) {
       for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
         // 2. Dummy Input Data
         int64_t data[4] = {1, 2, 3, 4};
-        PaddleTensor tensor{.name = "",
-                            .shape = std::vector<int>({4, 1}),
-                            .data = PaddleBuf(data, sizeof(data)),
-                            .dtype = PaddleDType::INT64};
+        PaddleTensor tensor;
+        tensor.shape = std::vector<int>({4, 1});
+        tensor.data = PaddleBuf(data, sizeof(data));
+        tensor.dtype = PaddleDType::INT64;
+
         std::vector<PaddleTensor> inputs(4, tensor);
         std::vector<PaddleTensor> outputs;
         // 3. Run
paddle/fluid/inference/api/demo_ci/vis_demo.cc
View file @ a58dd3e5

@@ -20,8 +20,8 @@ limitations under the License. */
 #include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
 #include <fstream>
 #include <iostream>
+#include "paddle/fluid/inference/demo_ci/utils.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "utils.h"

 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -123,11 +123,11 @@ void Main(bool use_gpu) {
   file.close();

   // Inference.
-  PaddleTensor input{
-      .name = "xx",
-      .shape = record.shape,
-      .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
-      .dtype = PaddleDType::FLOAT32};
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.data =
+      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
+  input.dtype = PaddleDType::FLOAT32;

   VLOG(3) << "run executor";
   std::vector<PaddleTensor> output;
paddle/fluid/inference/api/paddle_inference_api.h
View file @ a58dd3e5

@@ -44,7 +44,7 @@ class PaddleBuf {
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
   // Own memory.
-  PaddleBuf(size_t length)
+  explicit PaddleBuf(size_t length)
       : data_(new char[length]), length_(length), memory_owned_(true) {}
   // Resize to `length` bytes.
   void Resize(size_t length);
@@ -67,9 +67,9 @@ struct PaddleTensor {
   PaddleTensor() = default;
   std::string name;  // variable name.
   std::vector<int> shape;
-  // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
   PaddleBuf data;  // blob of data.
   PaddleDType dtype;
+  std::vector<std::vector<uint64_t>> lod;  // lod data
 };

 enum class PaddleEngineKind {
@@ -126,9 +126,11 @@ struct NativeConfig : public PaddlePredictor::Config {
 // Configurations for Anakin engine.
 struct AnakinConfig : public PaddlePredictor::Config {
+  enum TargetType { NVGPU = 0, X86 };
   int device;
   std::string model_file;
   int max_batch_size{-1};
+  TargetType target_type;
 };

 struct TensorRTConfig : public NativeConfig {
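Marking the size_t constructor explicit removes a silent conversion: an integer can no longer implicitly become an owning buffer. A short sketch of what still compiles and what no longer does:

    paddle::PaddleBuf owned(1024);       // explicit allocation still works
    // paddle::PaddleBuf oops = 1024;    // no longer compiles after this change
    float data[4] = {0.f, 1.f, 2.f, 3.f};
    paddle::PaddleBuf view(data, sizeof(data));  // non-owning wrap, unchanged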
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
View file @ a58dd3e5

 # Add TRT tests
 nv_library(tensorrt_converter
-  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
   DEPS tensorrt_engine operator scope framework_proto op_registry)
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -13,6 +13,10 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
+nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
 nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
+nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
浏览文件 @
a58dd3e5
...
@@ -20,11 +20,60 @@ namespace tensorrt {
 class Conv2dOpConverter : public OpConverter {
  public:
-  Conv2dOpConverter() {}
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
     LOG(INFO)
         << "convert a fluid conv2d op to tensorrt conv layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("Input").front());
+    // Declare weights
+    auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL);
+    const int n_output = Y_t->dims()[0];
+    const int filter_h = Y_t->dims()[2];
+    const int filter_w = Y_t->dims()[3];
+
+    const int groups = boost::get<int>(op_desc.GetAttr("groups"));
+    const std::vector<int> dilations =
+        boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
+    const std::vector<int> strides =
+        boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+    const std::vector<int> paddings =
+        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+
+    nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
+    nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
+    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(weight_data),
+                                  Y_t->memory_size() / sizeof(float)};
+
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
+        nv_ksize, weight.get(), bias.get());
+    PADDLE_ENFORCE(layer != nullptr);
+    layer->setStride(nv_strides);
+    layer->setPadding(nv_paddings);
+    layer->setDilation(nv_dilations);
+    layer->setNbGroups(groups);
+
+    auto output_name = op_desc.Output("Output").front();
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
...
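A note on the weight bookkeeping above: TensorRT expects the raw element count, and `Y_t->memory_size() / sizeof(float)` equals the product of the 4-D filter dims [n_output, c_in/groups, filter_h, filter_w]. A minimal standalone sketch of that arithmetic (the helper name is ours, not Paddle's):

#include <cassert>
#include <cstddef>
#include <vector>

// The weight element count handed to TensorRT must equal the product of
// the 4-D filter dims; memory_size() / sizeof(float) in the converter
// computes the same value from the raw buffer size.
size_t FilterElementCount(const std::vector<int>& filter_dims) {
  size_t count = 1;
  for (int d : filter_dims) count *= static_cast<size_t>(d);
  return count;
}

int main() {
  // e.g. the filter declared in the conv2d unit test below: (3, 2, 3, 3)
  const std::vector<int> filter_dims = {3, 2, 3, 3};
  assert(FilterElementCount(filter_dims) == 54);
  return 0;
}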
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc (new file, mode 100644, view file @ a58dd3e5)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {

class ElementwiseWeightOpConverter : public OpConverter {
 public:
  ElementwiseWeightOpConverter() {}
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    framework::OpDesc op_desc(op, nullptr);
    LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";

    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);

    auto* X = engine_->GetITensor(op_desc.Input("X").front());
    nvinfer1::Dims dims_x = X->getDimensions();
    PADDLE_ENFORCE(dims_x.nbDims >= 3);

    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
    PADDLE_ENFORCE_NOT_NULL(Y_v);
    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
    auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;

    std::vector<int> dims_y = framework::vectorize2int(Y_t->dims());
    if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) {
      if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
    }

    if (static_cast<int>(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) {
      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
    } else if (static_cast<int>(dims_y.size()) == dims_x.nbDims &&
               dims_y[0] == dims_x.d[0]) {
      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
      for (int i = 1; i < dims_x.nbDims; i++) {
        if (dims_y[i] != dims_x.d[i]) {
          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
          break;
        }
      }
      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
        for (int i = 1; i < dims_x.nbDims; i++) {
          if (dims_y[i] != 1)
            PADDLE_THROW(
                "TensorRT unsupported weight shape for Elementwise op!");
        }
      }
    } else {
      PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
    }

    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
                                         static_cast<void*>(weight_data),
                                         Y_t->memory_size() / sizeof(float)};
    TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                         0};
    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                         0};

    nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
        engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
        shift_weights.get(), scale_weights.get(), power_weights.get());
    auto output_name = op_desc.Output("Out")[0];

    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {  // the test framework can not determine which is the
                      // output, so place the declaration inside.
      engine_->DeclareOutput(output_name);
    }
  }
};

class ElementwiseTensorOpConverter : public OpConverter {
 public:
  ElementwiseTensorOpConverter() {}
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    framework::OpDesc op_desc(op, nullptr);
    LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";

    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);

    auto* X = engine_->GetITensor(op_desc.Input("X").front());
    auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
    nvinfer1::Dims dims_x = X->getDimensions();
    nvinfer1::Dims dims_y = Y->getDimensions();

    // The two input tensor should have the same dims
    PADDLE_ENFORCE(dims_x.nbDims >= 3);
    if (dims_x.nbDims == dims_y.nbDims) {
      for (int i = 0; i < dims_x.nbDims; i++) {
        if (dims_x.d[i] != dims_y.d[i])
          PADDLE_THROW(
              "TensorRT unsupported tensor shape for Elementwise op!");
      }
    } else {
      PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
    }

    auto op_pair = ops.find(op_type_);
    if (op_pair == ops.end()) {
      PADDLE_THROW("Wrong elementwise op type!");
    }
    nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
        engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
        *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);

    auto output_name = op_desc.Output("Out")[0];
    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {  // the test framework can not determine which is the
                      // output, so place the declaration inside.
      engine_->DeclareOutput(output_name);
    }
  }

 protected:
  static const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
      ops;
  std::string op_type_;
};

const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
    ElementwiseTensorOpConverter::ops = {
        {"add", nvinfer1::ElementWiseOperation::kSUM},
        {"mul", nvinfer1::ElementWiseOperation::kPROD},
        {"sub", nvinfer1::ElementWiseOperation::kSUB},
        {"div", nvinfer1::ElementWiseOperation::kDIV},
        {"min", nvinfer1::ElementWiseOperation::kMIN},
        {"pow", nvinfer1::ElementWiseOperation::kPOW},
        {"max", nvinfer1::ElementWiseOperation::kMAX},
};

class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
 public:
  ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
};

class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter {
 public:
  ElementwiseTensorMulOpConverter() { op_type_ = "mul"; }
};

class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter {
 public:
  ElementwiseTensorSubOpConverter() { op_type_ = "sub"; }
};

class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter {
 public:
  ElementwiseTensorDivOpConverter() { op_type_ = "div"; }
};

class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter {
 public:
  ElementwiseTensorMinOpConverter() { op_type_ = "min"; }
};

class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter {
 public:
  ElementwiseTensorMaxOpConverter() { op_type_ = "max"; }
};

class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
 public:
  ElementwiseTensorPowOpConverter() { op_type_ = "pow"; }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter);

REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, ElementwiseTensorAddOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor, ElementwiseTensorSubOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor, ElementwiseTensorDivOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor, ElementwiseTensorMulOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor, ElementwiseTensorMaxOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor, ElementwiseTensorMinOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor, ElementwiseTensorPowOpConverter);
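The branchy shape logic in ElementwiseWeightOpConverter reads more easily condensed into one standalone helper; this sketch (names ours, not Paddle's) mirrors the decision the converter makes between per-channel and per-element scaling:

#include <stdexcept>
#include <vector>

// A weight of shape (C) or (C, 1, 1) against an input of shape (C, H, W)
// is a per-channel scale (kCHANNEL); a weight matching every input dim is
// a full per-element scale (kELEMENTWISE); anything else is rejected.
enum class Mode { kChannel, kElementwise };

Mode PickScaleMode(const std::vector<int>& x, std::vector<int> y) {
  if (y.size() == x.size() + 1 && y[0] == 1) y.erase(y.begin());
  if (y.size() == 1 && y[0] == x[0]) return Mode::kChannel;
  if (y.size() == x.size() && y[0] == x[0]) {
    bool all_match = true;
    for (size_t i = 1; i < x.size(); ++i)
      if (y[i] != x[i]) all_match = false;
    if (all_match) return Mode::kElementwise;
    // Mismatched trailing dims are only allowed when they are all 1,
    // i.e. the (C, 1, 1) broadcast case exercised by the unit test below.
    for (size_t i = 1; i < x.size(); ++i)
      if (y[i] != 1) throw std::runtime_error("unsupported weight shape");
    return Mode::kChannel;
  }
  throw std::runtime_error("unsupported weight shape");
}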
paddle/fluid/inference/tensorrt/convert/fc_op.cc (view file @ a58dd3e5)
...
@@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
 }
 
 // indata c * k
 // Reorder the data layout from CK to KC.
-void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,  // NOLINT
                    TensorRTEngine::Weight* oweights) {
   int c = iweights.dims[0];
   int k = iweights.dims[1];
...
paddle/fluid/inference/tensorrt/convert/op_converter.h (view file @ a58dd3e5)
...
@@ -55,6 +55,31 @@ class OpConverter {
         it = Registry<OpConverter>::Lookup("fc");
       }
     }
+
+    if (op_desc.Type().find("elementwise") != std::string::npos) {
+      static std::unordered_set<std::string> add_tensor_op_set{
+          "add", "mul", "sub", "div", "max", "min", "pow"};
+      // TODO(xingzhaolong): all mul, sub, div
+      // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
+      // "sub", "div"};
+      static std::unordered_set<std::string> add_weight_op_set{"add"};
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      int op_type_len = op_desc.Type().size();
+      std::string op_type =
+          op_desc.Type().substr(op_type_len - 3, op_type_len);
+      std::string Y = op_desc.Input("Y")[0];
+      if (parameters.count(Y)) {
+        PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
+                       "Unsupported elementwise type" + op_type);
+        it =
+            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight");
+      } else {
+        PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
+                       "Unsupported elementwise type" + op_type);
+        it =
+            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
+      }
+    }
+
     if (!it) {
       it = Registry<OpConverter>::Lookup(op_desc.Type());
     }
...
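The dispatch relies on every elementwise suffix being exactly three characters (add/mul/sub/div/max/min/pow), which is what `substr(op_type_len - 3, ...)` assumes. A compact sketch of how the lookup key is formed (the function name is ours):

#include <cassert>
#include <string>

// "elementwise_add" with a parameter Y resolves to "elementwise_add_weight";
// with a runtime tensor Y it resolves to "elementwise_add_tensor".
std::string ConverterKey(const std::string& op, bool y_is_parameter) {
  std::string suffix = op.substr(op.size() - 3);  // only 3-char suffixes exist
  return "elementwise_" + suffix + (y_is_parameter ? "_weight" : "_tensor");
}

int main() {
  assert(ConverterKey("elementwise_add", true) == "elementwise_add_weight");
  assert(ConverterKey("elementwise_pow", false) == "elementwise_pow_tensor");
  return 0;
}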
paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc (new file, mode 100644, view file @ a58dd3e5)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {

TEST(conv2d_op, test) {
  std::unordered_set<std::string> parameters({"conv2d-Y"});
  framework::Scope scope;
  TRTConvertValidation validator(5, parameters, scope, 1 << 15);

  validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5));
  validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
  validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5));

  // Prepare Op description
  framework::OpDesc desc;
  desc.SetType("conv2d");
  desc.SetInput("Input", {"conv2d-X"});
  desc.SetInput("Filter", {"conv2d-Y"});
  desc.SetOutput("Output", {"conv2d-Out"});

  const std::vector<int> strides({1, 1});
  const std::vector<int> paddings({1, 1});
  const std::vector<int> dilations({1, 1});
  const int groups = 1;

  desc.SetAttr("strides", strides);
  desc.SetAttr("paddings", paddings);
  desc.SetAttr("dilations", dilations);
  desc.SetAttr("groups", groups);

  validator.SetOp(*desc.Proto());

  validator.Execute(3);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

USE_OP(conv2d);
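The declared shapes are consistent with standard convolution arithmetic: out = (in + 2*pad - (dilation*(k-1) + 1)) / stride + 1 per spatial dim. A quick check (the helper is ours):

#include <cassert>

// Standard conv output-size formula for one spatial dimension.
int ConvOutSize(int in, int k, int stride, int pad, int dilation) {
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
}

int main() {
  // Input (2, 5, 5), filter (3, 2, 3, 3), stride 1, padding 1, dilation 1
  // => output (3, 5, 5), matching the DeclOutputVar call above.
  assert(ConvOutSize(5, 3, 1, 1, 1) == 5);
  return 0;
}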
paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc (new file, mode 100644, view file @ a58dd3e5)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {

TEST(elementwise_op, add_weight_test) {
  std::unordered_set<std::string> parameters({"elementwise_add-Y"});
  framework::Scope scope;
  TRTConvertValidation validator(10, parameters, scope, 1 << 15);
  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
  validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));

  // Prepare Op description
  framework::OpDesc desc;
  desc.SetType("elementwise_add");
  desc.SetInput("X", {"elementwise_add-X"});
  desc.SetInput("Y", {"elementwise_add-Y"});
  desc.SetOutput("Out", {"elementwise_add-Out"});

  int axis = 1;
  desc.SetAttr("axis", axis);

  validator.SetOp(*desc.Proto());
  validator.Execute(8);
}

TEST(elementwise_op, add_tensor_test) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
  TRTConvertValidation validator(8, parameters, scope, 1 << 15);
  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
  validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3));
  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));

  // Prepare Op description
  framework::OpDesc desc;
  desc.SetType("elementwise_add");
  desc.SetInput("X", {"elementwise_add-X"});
  desc.SetInput("Y", {"elementwise_add-Y"});
  desc.SetOutput("Out", {"elementwise_add-Out"});

  // the default axis of elementwise op is -1
  validator.SetOp(*desc.Proto());
  validator.Execute(8);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

USE_OP(elementwise_add);
paddle/fluid/inference/tensorrt/convert/test_op_converter.cc (view file @ a58dd3e5)
...
@@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) {
   framework::ProgramDesc prog;
   auto* block = prog.MutableBlock(0);
   auto* conv2d_op = block->AppendOp();
+
+  // init trt engine
+  cudaStream_t stream_;
+  std::unique_ptr<TensorRTEngine> engine_;
+  engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_));
+  engine_->InitNetwork();
+  PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+
+  engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
+                        nvinfer1::Dims3(2, 5, 5));
+
   conv2d_op->SetType("conv2d");
+  conv2d_op->SetInput("Input", {"conv2d-X"});
+  conv2d_op->SetInput("Filter", {"conv2d-Y"});
+  conv2d_op->SetOutput("Output", {"conv2d-Out"});
+
+  const std::vector<int> strides({1, 1});
+  const std::vector<int> paddings({1, 1});
+  const std::vector<int> dilations({1, 1});
+  const int groups = 1;
+
+  conv2d_op->SetAttr("strides", strides);
+  conv2d_op->SetAttr("paddings", paddings);
+  conv2d_op->SetAttr("dilations", dilations);
+  conv2d_op->SetAttr("groups", groups);
 
-  OpConverter converter;
+  // init scope
   framework::Scope scope;
-  converter.ConvertBlock(*block->Proto(), {}, scope,
-                         nullptr /*TensorRTEngine*/);
+  std::vector<int> dim_vec = {3, 2, 3, 3};
+  auto* x = scope.Var("conv2d-Y");
+  auto* x_tensor = x->GetMutable<framework::LoDTensor>();
+  x_tensor->Resize(framework::make_ddim(dim_vec));
+
+  OpConverter converter;
+  converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope,
+                         engine_.get() /*TensorRTEngine*/);
 }
 
 }  // namespace tensorrt
...
paddle/fluid/inference/tensorrt/convert/ut_helper.h (view file @ a58dd3e5)
...
@@ -149,7 +149,7 @@ class TRTConvertValidation {
     cudaStreamSynchronize(*engine_->stream());
 
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    const size_t output_space_size = 2000;
+    const size_t output_space_size = 3000;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
      std::vector<float> trt_out(output_space_size);
...
paddle/fluid/operators/conv_cudnn_op.cu.cc (view file @ a58dd3e5)
...
@@ -20,10 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 
-DEFINE_bool(cudnn_deterministic, true,
+DEFINE_bool(cudnn_deterministic, false,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
-            "false, the algorithm is deterministic.");
+            "true, the algorithm is deterministic.");
 
 namespace paddle {
 namespace operators {
...
@@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      if (FLAGS_cudnn_deterministic) {
+      if (!FLAGS_cudnn_deterministic) {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
...
@@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      if (FLAGS_cudnn_deterministic) {
+      if (!FLAGS_cudnn_deterministic) {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
...
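With the new default, autotuning stays on and setting cudnn_deterministic to true opts into the fixed, reproducible algorithm choice. A minimal sketch of toggling the flag from another translation unit, using only standard gflags machinery:

#include "gflags/gflags.h"

// The flag itself is defined in conv_cudnn_op.cu.cc above; other
// translation units reference it through DECLARE_bool.
DECLARE_bool(cudnn_deterministic);

void UseDeterministicConv() {
  // Skip the cudnnGet*Algorithm autotuning paths (now guarded by
  // !FLAGS_cudnn_deterministic) and use the fixed algorithms instead.
  FLAGS_cudnn_deterministic = true;
}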
paddle/fluid/operators/conv_mkldnn_op.cc (view file @ a58dd3e5)
...
@@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(src_pd, user_pd, user_memory_p,
...
@@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
   std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
...
@@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
   std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
...
@@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
     auto user_pd = user_weights_memory_p->get_primitive_desc();
     return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
...
@@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto src_pd = conv_pd_->src_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
...
@@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
     auto weights_pd = conv_pd_->weights_primitive_desc();
     return this->AcquireMemory(weights_pd, user_weights_pd,
...
@@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
   // Generate keys for storing/retriving primitives for this operator
   // TODO(jczaja): Make hashing function more optimial
-  static std::string GetHash(memory::dims& input_dims,
-                             memory::dims& weights_dims,
-                             std::vector<int>& strides,
-                             std::vector<int>& paddings,
-                             std::vector<int>& dilations, int groups,
-                             const std::string& suffix) {
+  static std::string GetHash(memory::dims& input_dims,     // NOLINT
+                             memory::dims& weights_dims,   // NOLINT
+                             std::vector<int>& strides,    // NOLINT
+                             std::vector<int>& paddings,   // NOLINT
+                             std::vector<int>& dilations,  // NOLINT
+                             int groups, const std::string& suffix) {
     return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
            dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
            suffix;
...
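Every hunk in this file is the same mechanical fix: cpplint's runtime/references check flags mutable reference parameters, and a trailing `// NOLINT` silences it on that line without changing the signature. A small illustration (the function is ours, not from this diff):

#include <vector>

// cpplint would normally warn "Is this a non-const reference?" here;
// the trailing NOLINT marker tells it to skip this line.
void AppendPrimitiveCount(std::vector<int>& pipeline_sizes) {  // NOLINT
  pipeline_sizes.push_back(static_cast<int>(pipeline_sizes.size()));
}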
paddle/fluid/operators/elementwise_add_mkldnn_op.cc (view file @ a58dd3e5)
...
@@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
     int axis = ctx.Attr<int>("axis");
     auto x_dims = x->dims();
-    auto y_dims = y->dims();
+    auto y_dims_untrimed = y->dims();
     auto z_dims = z->dims();
 
     // Execute default elementwise_add operator when
     // broadcast operations need to performed.
-    if (x_dims != y_dims) {
+    if (x_dims != y_dims_untrimed) {
       auto sum_func = [](T a, T b) -> T { return a + b; };
 
       TransformFunctor<decltype(sum_func), T,
...
@@ -62,11 +62,11 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
           ctx.template device_context<paddle::platform::CPUDeviceContext>(),
           sum_func);
 
-      axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+      axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
       PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                      "Axis should be in range [0, x_dims)");
-      trim_trailing_singular_dims(&y_dims);
+      auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
       axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
       int pre, n, post;
...
@@ -88,7 +88,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
                  "Wrong layout/format set for Y tensor");
 
     std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
-    std::vector<int> src_y_tz = framework::vectorize2int(y_dims);
+    std::vector<int> src_y_tz = framework::vectorize2int(y_dims_untrimed);
     std::vector<int> dst_tz = framework::vectorize2int(z_dims);
 
     std::vector<memory::primitive_desc> srcs_pd;
...
@@ -142,36 +142,39 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
+    // skip out, x, y,
+    // dout length is larger or equal than dx, dy.
+    auto* out = dout;
+    auto *x = dout, *y = dout;
 
     auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
       in->set_layout(DataLayout::kMKLDNN);
       in->set_format(out->format());
     };
 
-    if (x->dims() == y->dims()) {
-      auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
-      if (dx) {
-        blas.VCOPY(dout->numel(), dout->data<T>(),
-                   dx->mutable_data<T>(ctx.GetPlace()));
-        set_mkldnn_format(dx, dout);
-      }
+    if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
+      if (dx->dims() == dy->dims()) {
+        auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
+        if (dx) {
+          blas.VCOPY(dout->numel(), dout->data<T>(),
+                     dx->mutable_data<T>(ctx.GetPlace()));
+          set_mkldnn_format(dx, dout);
+        }
 
-      if (dy) {
-        blas.VCOPY(dout->numel(), dout->data<T>(),
-                   dy->mutable_data<T>(ctx.GetPlace()));
-        set_mkldnn_format(dy, dout);
-      }
+        if (dy) {
+          blas.VCOPY(dout->numel(), dout->data<T>(),
+                     dy->mutable_data<T>(ctx.GetPlace()));
+          set_mkldnn_format(dy, dout);
+        }
+      }
     } else {
       // Execute default kernel when broadcast is needed
-      ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T,
-                          IdentityGrad<T>, IdentityGrad<T>>(
+      ElemwiseExplicitGradCompute<paddle::platform::CPUDeviceContext, T,
+                                  IdentityGrad<T>, IdentityGrad<T>>(
           ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
           IdentityGrad<T>());
     }
...
paddle/fluid/operators/elementwise_add_op.cc (view file @ a58dd3e5)
...
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y");
+REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
+REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
+                              "X");
+
 REGISTER_OP_CPU_KERNEL(
     elementwise_add,
     ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
...
paddle/fluid/operators/elementwise_add_op.h (view file @ a58dd3e5)
...
@@ -95,9 +95,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
                                   framework::Tensor* dy) {
   int axis = ctx.Attr<int>("axis");
 
-  ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
-      ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
-      IdentityGrad<T>());
+  ElemwiseExplicitGradCompute<DeviceContext, T, IdentityGrad<T>,
+                              IdentityGrad<T>>(ctx, *x, *y, *out, *dout, axis,
+                                               dx, dy, IdentityGrad<T>(),
+                                               IdentityGrad<T>());
 }
 
 template <typename DeviceContext, typename T>
...
@@ -140,14 +141,15 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    // skip out, x, y
+    auto* out = dout;
+    auto *x = dout, *y = dout;
 
-    if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) {
+    if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr &&
+        dy != nullptr && (dx->dims() == dy->dims())) {
       elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
     } else {
       default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
...
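This is the core trick of the change: since d(X+Y)/dX = d(X+Y)/dY = 1, the add gradient never reads X, Y, or Out, so the kernel aliases all three to dout and the framework no longer needs to keep them alive for the backward pass. A stripped-down sketch of why the aliasing is safe (the functor mirrors Paddle's IdentityGrad):

#include <cassert>
#include <vector>

// For Out = X + Y, dX = dOut and dY = dOut elementwise; the x, y and out
// arguments are never consulted, which is why substituting dout for all
// three in the kernel above changes nothing.
template <typename T>
struct IdentityGrad {
  T operator()(T /*x*/, T /*y*/, T /*out*/, T dout) const { return dout; }
};

int main() {
  IdentityGrad<float> grad;
  std::vector<float> dout = {0.5f, -2.0f};
  for (float g : dout) {
    // Any placeholder works for x / y / out; the result is always dout.
    assert(grad(123.0f, 456.0f, 789.0f, g) == g);
  }
  return 0;
}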
paddle/fluid/operators/elementwise_div_op.cc (view file @ a58dd3e5)
...
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_div_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
...
paddle/fluid/operators/elementwise_op.h (view file @ a58dd3e5)
...
@@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() final {
     AddInput("X", "(Tensor), The first input tensor of elementwise op.");
     AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
-    AddOutput("Out", "The output of elementwise op.").Reuse("X");
+    // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save
+    // memory.").AsIntermediate();
+    AddOutput("Out", "The output of elementwise op.");
     AddAttr<int>("axis",
                  "(int, default -1). The start dimension index "
                  "for broadcasting Y onto X.")
...
@@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$.
 )DOC",
                                GetName(), GetEquation()));
+    SetReuse();
   }
 
  protected:
   virtual std::string GetName() const = 0;
   virtual std::string GetEquation() const = 0;
+  virtual void SetReuse() {}
 };
 
 class ElementwiseOpGrad : public framework::OperatorWithKernel {
...
@@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     auto input_data_type =
         framework::ToDataType(
-            ctx.Input<Tensor>("X")->type());
+            ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
 
 #ifdef PADDLE_WITH_MKLDNN
     if (platform::CanMKLDNNBeUsed(ctx)) {
...
@@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
+
+// For Add, Sub op, the X, Out is not needed.
+class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
+ public:
+  using operators::ElementwiseOpGrad::ElementwiseOpGrad;
+  using operators::ElementwiseOpGrad::GetExpectedKernelType;
+  using Tensor = framework::Tensor;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+      ctx->SetOutputDim(x_grad_name, out_dims);
+    }
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(y_grad_name)) {
+      PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+      auto y_dims = ctx->GetInputDim("Y");
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
+
+/*
+*/
+#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name)                   \
+  class kernel_type##GradMaker                                               \
+      : public paddle::framework::SingleGradOpDescMaker {                    \
+   public:                                                                   \
+    using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
+                                                                             \
+   protected:                                                                \
+    std::unique_ptr<paddle::framework::OpDesc> Apply() const override {      \
+      auto* op = new paddle::framework::OpDesc();                            \
+      op->SetType(#kernel_type "_grad");                                     \
+      op->SetInput("Y", Input("Y"));                                         \
+      op->SetInput(::paddle::framework::GradVarName("Out"),                  \
+                   OutputGrad("Out"));                                       \
+      op->SetAttrMap(Attrs());                                               \
+      op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X"));  \
+      op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y"));  \
+      return std::unique_ptr<::paddle::framework::OpDesc>(op);               \
+    }                                                                        \
+  }
+
 #define REGISTER_ELEMWISE_OP(op_type, op_name, equation) \
   class __ElemwiseOp##op_type##Maker__                   \
       : public ::paddle::operators::ElementwiseOpMaker { \
...
@@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
       ::paddle::operators::ElementwiseOpInferVarType,                   \
       ::paddle::framework::DefaultGradOpDescMaker<true>);               \
   REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
+
+#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \
+  class __ElemwiseOp##op_type##Maker__                                 \
+      : public ::paddle::operators::ElementwiseOpMaker {               \
+   protected:                                                          \
+    virtual std::string GetName() const { return op_name; }            \
+    virtual std::string GetEquation() const { return equation; }       \
+    virtual void SetReuse() { Reuse(__VA_ARGS__); }                    \
+  };                                                                   \
+  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,       \
+                    __ElemwiseOp##op_type##Maker__,                    \
+                    ::paddle::operators::ElementwiseOpInferVarType,    \
+                    op_type##GradMaker);                               \
+  REGISTER_OPERATOR(op_type##_grad,                                    \
+                    ::paddle::operators::ElementwiseOpExplicitGrad)
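Putting the two new macros together: an op whose backward pass only needs Y and Out@GRAD registers as below, which is exactly how elementwise_add_op.cc uses them earlier in this diff:

// Expands to elementwise_addGradMaker, a grad-op maker that wires the
// backward op to consume only Y and Out@GRAD (no X, no Out) ...
REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
// ... and to an op whose Maker calls Reuse("Out", "X") via SetReuse(),
// letting "Out" share "X"'s buffer while using ElementwiseOpExplicitGrad
// for shape inference of the gradient op.
REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
                              "X");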
paddle/fluid/operators/elementwise_op_function.h (view file @ a58dd3e5)
...
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <glog/logging.h>
 #include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
...
@@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims,
   }
 }
 
-inline void trim_trailing_singular_dims(framework::DDim* dims) {
+inline framework::DDim trim_trailing_singular_dims(
+    const framework::DDim& dims) {
   // Remove trailing dimensions of size 1 for y
-  auto actual_dims_size = dims->size();
+  auto actual_dims_size = dims.size();
   for (; actual_dims_size != 0; --actual_dims_size) {
-    if ((*dims)[actual_dims_size - 1] != 1) break;
+    if (dims[actual_dims_size - 1] != 1) break;
   }
-  if (actual_dims_size != dims->size()) {
-    auto actual_dims = framework::vectorize(*dims);
-    actual_dims.resize(actual_dims_size);
-    *dims = framework::make_ddim(actual_dims);
-  }
+
+  std::vector<int> trim_dims;
+  trim_dims.resize(actual_dims_size);
+  for (int i = 0; i < actual_dims_size; ++i) {
+    trim_dims[i] = dims[i];
+  }
+  framework::DDim actual_dims = framework::make_ddim(trim_dims);
+  return actual_dims;
 }
 
 template <typename T, typename DeviceContext>
...
@@ -456,6 +462,71 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
 #endif
 
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseGradComputeNoBroadcast(
+    const framework::ExecutionContext& ctx, const framework::DDim& x_dim,
+    const framework::DDim& y_dim, const framework::Tensor& x,
+    const framework::Tensor& y, const framework::Tensor& out,
+    const framework::Tensor& dout, int axis, framework::Tensor* dx,
+    framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) {
+  size_t N = static_cast<size_t>(framework::product(x_dim));
+  platform::ForRange<DeviceContext> for_range(
+      ctx.template device_context<DeviceContext>(), N);
+  for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
+      x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
+      dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+      dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
+}
+
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseGradComputeWithBroadcast(
+    const framework::ExecutionContext& ctx, const framework::DDim& x_dim,
+    const framework::DDim& y_dim_untrimed, const framework::Tensor& x,
+    const framework::Tensor& y, const framework::Tensor& out,
+    const framework::Tensor& dout, int axis, framework::Tensor* dx,
+    framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) {
+  axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis);
+  auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);
+  axis = (y_dim.size() == 0) ? x_dim.size() : axis;
+
+  int pre, n, post;
+  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
+  if (post == 1) {
+    int h = pre;
+    int w = n;
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef __NVCC__
+      ElemwiseGradBroadcast1CUDA(
+          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
+          y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op, dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+#endif
+    } else {
+      ElemwiseGradBroadcast1CPU(
+          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), h, w,
+          dx_op, dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+    }
+  } else {
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef __NVCC__
+      ElemwiseGradBroadcast2CUDA(
+          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
+          y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
+          dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+#endif
+    } else {
+      ElemwiseGradBroadcast2CPU(
+          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n,
+          post, dx_op, dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+    }
+  }
+}
+
 template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
 void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                          const framework::Tensor& x, const framework::Tensor& y,
...
@@ -463,63 +534,50 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                          const framework::Tensor& dout, int axis,
                          framework::Tensor* dx, framework::Tensor* dy,
                          DX_OP dx_op, DY_OP dy_op) {
+  const framework::DDim x_dim = x.dims();
+  const framework::DDim y_dim = y.dims();
   if (x.dims() == y.dims()) {
-    size_t N = static_cast<size_t>(framework::product(x.dims()));
-    platform::ForRange<DeviceContext> for_range(
-        ctx.template device_context<DeviceContext>(), N);
-    for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
-        x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
-        dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-        dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
+    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
   } else {  // Y is a scalar
-    auto x_dim = x.dims();
-    auto y_dim = y.dims();
-    axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
-    trim_trailing_singular_dims(&y_dim);
-    axis = (y_dim.size() == 0) ? x_dim.size() : axis;
-    int pre, n, post;
-    get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
-    if (post == 1) {
-      int h = pre;
-      int w = n;
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-        ElemwiseGradBroadcast1CUDA(
-            ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-            y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-      } else {
-        ElemwiseGradBroadcast1CPU(
-            x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), h, w,
-            dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-      }
-    } else {
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-        ElemwiseGradBroadcast2CUDA(
-            ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-            y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
-            dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-      } else {
-        ElemwiseGradBroadcast2CPU(
-            x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n,
-            post, dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-      }
-    }
+    ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
   }
 }
 
+// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub.
+// explicit gradient can cut off X, Y, Out from gradient op
+// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse
+// elementwise code.
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
+                                 const framework::Tensor& x,
+                                 const framework::Tensor& y,
+                                 const framework::Tensor& out,
+                                 const framework::Tensor& dout, int axis,
+                                 framework::Tensor* dx, framework::Tensor* dy,
+                                 DX_OP dx_op, DY_OP dy_op) {
+  if (dy == nullptr) {
+    const framework::DDim dx_dims = dout.dims();
+    auto dy_dims = dx_dims;
+    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+  } else {
+    if (dout.dims() == dy->dims()) {
+      const framework::DDim dx_dims = dout.dims();
+      const framework::DDim dy_dims = dy->dims();
+      ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+          ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+    } else {  // Y is a scalar
+      auto dx_dims = dout.dims();
+      const framework::DDim dy_dims = dy->dims();
+      ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+          ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+    }
+  }
+}
+
+// Deprecated
 template <typename DeviceContext, typename T, typename functor,
           typename broadcastfunctor, typename broadcast2functor>
 void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
...
@@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
   }
 
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  trim_trailing_singular_dims(&y_dims);
+  trim_trailing_singular_dims(y_dims);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
...
@@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
       x, y, z, ctx.template device_context<DeviceContext>(), func);
 
   auto x_dims = x->dims();
-  auto y_dims = y->dims();
-  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+  auto y_dims_untrimed = y->dims();
+  PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
                     "Rank of first input must >= rank of second input.");
 
-  if (x_dims == y_dims) {
+  if (x_dims == y_dims_untrimed) {
     functor.Run();
     return;
  }
 
-  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
   PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                  "Axis should be in range [0, x_dims)");
-  trim_trailing_singular_dims(&y_dims);
+  auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
...
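trim_trailing_singular_dims now returns the trimmed shape instead of mutating its argument; the computation itself is unchanged. A standalone sketch of what it does, with plain ints standing in for framework::DDim:

#include <cassert>
#include <vector>

// Drop trailing dims of size 1: (10, 1, 1) -> (10); interior 1s survive,
// so (2, 1, 3) is left alone.
std::vector<int> TrimTrailingSingularDims(const std::vector<int>& dims) {
  size_t n = dims.size();
  while (n != 0 && dims[n - 1] == 1) --n;
  return std::vector<int>(dims.begin(), dims.begin() + n);
}

int main() {
  const std::vector<int> a = {10, 1, 1};
  const std::vector<int> b = {2, 1, 3};
  const std::vector<int> c = {1, 1};
  assert(TrimTrailingSingularDims(a) == std::vector<int>({10}));
  assert(TrimTrailingSingularDims(b) == b);
  // All-ones collapses to rank 0, which is why callers reset axis when
  // y_dims.size() == 0.
  assert(TrimTrailingSingularDims(c).empty());
  return 0;
}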
paddle/fluid/operators/elementwise_sub_op.cc (view file @ a58dd3e5)
...
@@ -15,7 +15,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_sub_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y");
+REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
+REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out",
+                              "X");
+
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
...
paddle/fluid/operators/elementwise_sub_op.h
...
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
...
@@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;

-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
+    // skip out, x, y
+    auto* out = dout;
+    auto* x = dout, *y = dout;
+    ElemwiseExplicitGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
         ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
   }
 };
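Aliasing x, y and out to dout in the new kernel is safe only because the sub gradient functors read nothing but dout. A minimal host-side sketch of those functors (paraphrased from this header; HOSTDEVICE is stubbed so the snippet compiles outside CUDA):

#include <cassert>

#define HOSTDEVICE  // stub; in Paddle this expands to __host__ __device__

template <typename T>
struct SubGradDX {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
};

template <typename T>
struct SubGradDY {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; }
};

int main() {
  // d(X - Y)/dX = 1 and d(X - Y)/dY = -1, so only dout matters:
  assert(SubGradDX<float>()(0.f, 0.f, 0.f, 2.5f) == 2.5f);
  assert(SubGradDY<float>()(0.f, 0.f, 0.f, 2.5f) == -2.5f);
  return 0;
}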
...
paddle/fluid/operators/listen_and_serv_op.cc
...
@@ -19,12 +19,17 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <vector>
+#include "gflags/gflags.h"
 #include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 #include "paddle/fluid/platform/profiler.h"

+DEFINE_int32(listen_and_serv_profile_period, 0,
+             "the period of listen_and_serv to do profile");
+
 namespace paddle {
 namespace operators {
...
@@ -122,7 +127,18 @@ void ListenAndServOp::RunSyncLoop(
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));

   rpc_service_->ResetBarrierCounter();
+  int32_t profile_step = 0;
   while (true) {
+    PADDLE_ENFORCE_LE(profile_step, FLAGS_listen_and_serv_profile_period,
+                      "profile_step should not be larger then "
+                      "FLAGS_listen_and_serv_profile_period");
+    if (FLAGS_listen_and_serv_profile_period > 0) {
+      if (profile_step == 0) {
+        auto pf_state = paddle::platform::ProfilerState::kCPU;
+        paddle::platform::EnableProfiler(pf_state);
+      }
+    }
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
     rpc_service_->SetCond(distributed::kRequestSend);
...
@@ -164,6 +180,15 @@ void ListenAndServOp::RunSyncLoop(
     // reset received sparse vars to avoid reuse it in the next mini-batch
     dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
         ->ResetSparseVarRecorder();
+    if (FLAGS_listen_and_serv_profile_period > 0) {
+      if (profile_step == FLAGS_listen_and_serv_profile_period) {
+        paddle::platform::DisableProfiler(
+            paddle::platform::EventSortingKey::kTotal, "/dev/null");
+        profile_step = 0;
+      } else {
+        profile_step++;
+      }
+    }
   }  // while(true)
 }
...
paddle/fluid/operators/math/im2col.cc
...
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/im2col.h"
 #include <vector>
+#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"

 namespace paddle {
 namespace operators {
...
@@ -35,61 +36,18 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col->dims().size() == 5);

-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[1];
-    int filter_width = col->dims()[2];
-    int output_height = col->dims()[3];
-    int output_width = col->dims()[4];
-    int channels_col = im_channels * filter_height * filter_width;
-
-    const T* im_data = im.data<T>();
-    T* col_data = col->data<T>();
-    // TODO(TJ): change me to template
-    // further optimaze:
-    // 1. padding != 1
-    // 2. could also support stride_h != 1
     if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
-        dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) {
-      int col_matrix_width = output_width * output_height;
-      size_t copy_size = sizeof(T) * output_width;
-      for (int oh = 0; oh < output_height; ++oh) {
-        const T* im_data_start = im_data + oh * im_width;
-        T* dst_data = col_data + oh * output_width;
-        for (int ic = 0; ic < im_channels; ++ic) {
-          const T* src_data = im_data_start + ic * im_height * im_width;
-          for (int kh = 0; kh < filter_height; ++kh) {
-            for (int kw = 0; kw < filter_width; ++kw) {
-              std::memcpy(dst_data, src_data + kw, copy_size);
-              dst_data = dst_data + col_matrix_width;
-            }
-            src_data = src_data + im_width;
-          }
-        }
-      }
-      return;
-    }
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < output_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-        for (int w = 0; w < output_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-          int col_idx = (c * output_height + h) * output_width + w;
-          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
-          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                               im_col_idx < 0 || im_col_idx >= im_width)
-                                  ? static_cast<T>(0)
-                                  : im_data[im_idx];
-        }
-      }
-    }
+        dilation[1] == 1) {
+      if (padding[0] == 0 && padding[1] == 0) {
+        im2col_sh1sw1dh1dw1ph0pw0<T>(im, col);
+        return;
+      } else if (padding[0] == 1 && padding[1] == 1) {
+        im2col_sh1sw1dh1dw1ph1pw1<T>(im, col);
+        return;
+      }
+      // TODO(TJ): complete padding >=2
+    }
+    im2col_common<T>(im, dilation, stride, padding, col);
   }
 };
...
paddle/fluid/operators/math/im2col_cfo_cpu.h
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <vector>
#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace operators {
namespace math {

/**
 * The most common im2col algorithm.
 * Support dilation, stride and padding.
 */
template <typename T>
inline void im2col_common(const framework::Tensor& im,
                          const std::vector<int>& dilation,
                          const std::vector<int>& stride,
                          const std::vector<int>& padding,
                          framework::Tensor* col) {
  int im_channels = im.dims()[0];
  int im_height = im.dims()[1];
  int im_width = im.dims()[2];
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int output_height = col->dims()[3];
  int output_width = col->dims()[4];
  int channels_col = im_channels * filter_height * filter_width;

  const T* im_data = im.data<T>();
  T* col_data = col->data<T>();
  for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % filter_width;
    int h_offset = (c / filter_width) % filter_height;
    int c_im = c / (filter_width * filter_height);
    for (int h = 0; h < output_height; ++h) {
      int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
      for (int w = 0; w < output_width; ++w) {
        int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
        int col_idx = (c * output_height + h) * output_width + w;
        int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
        col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
                             im_col_idx < 0 || im_col_idx >= im_width)
                                ? static_cast<T>(0)
                                : im_data[im_idx];
      }
    }
  }
}

/**
 * im2col algorithm with strides == 1, dilations == 1, paddings == 0
 */
template <typename T>
inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
                                      framework::Tensor* col) {
  int im_channels = im.dims()[0];
  int im_height = im.dims()[1];
  int im_width = im.dims()[2];
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int output_height = col->dims()[3];
  int output_width = col->dims()[4];

  const T* im_data = im.data<T>();
  T* col_data = col->data<T>();
  int col_matrix_width = output_width * output_height;
  int im_size = im_height * im_width;
  size_t copy_size = sizeof(T) * output_width;
  const T* im_data_oh = im_data;
  T* dst_data_oh = col_data;
  for (int oh = 0; oh < output_height; ++oh) {
    const T* src_data_ic = im_data_oh;
    T* dst_data = dst_data_oh;
    for (int ic = 0; ic < im_channels; ++ic) {
      const T* src_data = src_data_ic;
      for (int kh = 0; kh < filter_height; ++kh) {
        for (int kw = 0; kw < filter_width; ++kw) {
          std::memcpy(dst_data, src_data + kw, copy_size);
          dst_data = dst_data + col_matrix_width;
        }
        src_data = src_data + im_width;
      }
      src_data_ic = src_data_ic + im_size;
    }
    im_data_oh = im_data_oh + im_width;
    dst_data_oh = dst_data_oh + output_width;
  }
}

/**
 * im2col algorithm with strides == 1, dilations == 1, paddings == 1
 * and filter_width == 1 have a special implementation
 */
template <typename T>
inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
                                      framework::Tensor* col) {
  int im_channels = im.dims()[0];
  int im_height = im.dims()[1];
  int im_width = im.dims()[2];
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int output_height = col->dims()[3];
  int output_width = col->dims()[4];

  constexpr int plh = 1;
  constexpr int prh = 1;
  constexpr int plw = 1;
  constexpr int prw = 1;

  const T* im_data = im.data<T>();
  T* col_data = col->data<T>();
  int im_size = im_height * im_width;
  int col_matrix_width = output_width * output_height;
  int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
  int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow

  // fill height padding
  {
    size_t copy_size = sizeof(T) * output_width;
    T* col_start_l = col_data;
    T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
                     col_matrix_width - output_width;
    for (int ic = 0; ic < im_channels; ++ic) {
      T* dst_data_l = col_start_l;
      T* dst_data_r = col_start_r;
      for (int kw = 0; kw < filter_width; ++kw) {
        std::memset(dst_data_l, 0, copy_size);
        std::memset(dst_data_r, 0, copy_size);
        dst_data_l = dst_data_l + col_matrix_width;
        dst_data_r = dst_data_r + col_matrix_width;
      }
      col_start_l = col_start_l + col_block_ic;
      col_start_r = col_start_r + col_block_ic;
    }
  }

  auto pad = static_cast<T>(0);
  if (filter_width == 1) {
    // fill width padding
    T* dst_data_ic = col_data;
    for (int ic = 0; ic < im_channels; ++ic) {
      T* dst_data_kh = dst_data_ic;
      for (int kh = 0; kh < filter_height; ++kh) {
        T* dst_data = dst_data_kh;
        for (int oh = 0; oh < output_height; ++oh) {
          *dst_data = pad;
          dst_data = dst_data + output_width - 1;
          *dst_data = pad;
          ++dst_data;
        }
        dst_data_kh = dst_data_kh + col_block_fh;
      }
      dst_data_ic = dst_data_ic + col_block_ic;
    }
    // fill core
    size_t copy_size = sizeof(T) * (output_width - plw - prw);
    for (int oh = 0; oh < output_height; ++oh) {
      const T* im_data_start =
          im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
      T* dst_data = col_data + oh * output_width;
      for (int ic = 0; ic < im_channels; ++ic) {
        const T* src_data = im_data_start + ic * im_size;
        for (int kh = 0; kh < filter_height; ++kh) {
          if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
                                         kh > (filter_height - prh - 1))) {
            dst_data = dst_data + col_matrix_width;
            continue;
          }
          std::memcpy(dst_data + plw, src_data, copy_size);
          dst_data = dst_data + col_matrix_width;
          src_data = src_data + im_width;
        }
      }
    }
    return;
  }

  // filter_width != 1
  // fill width padding
  T* dst_data_ic = col_data;
  for (int ic = 0; ic < im_channels; ++ic) {
    T* dst_data_kh = dst_data_ic;
    for (int kh = 0; kh < filter_height; ++kh) {
      for (T* dst_data :
           {dst_data_kh, dst_data_kh +
                             (filter_width - prw) * col_matrix_width +
                             output_width - 1}) {
        // TODO(TJ): from plh, saving repeated assignment
        for (int oh = 0; oh < output_height; ++oh) {
          *dst_data = pad;
          dst_data = dst_data + output_width;
        }
      }
      dst_data_kh = dst_data_kh + col_block_fh;
    }
    dst_data_ic = dst_data_ic + col_block_ic;
  }

  // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
  // (output_width-1)}
  // length of copy_size is equal kw.
  for (int oh = 0; oh < output_height; ++oh) {
    const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
    T* dst_data = col_data + oh * output_width;
    for (int ic = 0; ic < im_channels; ++ic) {
      const T* src_data = im_data_start + ic * im_size;
      for (int kh = 0; kh < filter_height; ++kh) {
        if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
                                       kh > (filter_height - prh - 1))) {
          dst_data = dst_data + filter_width * col_matrix_width;
          continue;
        }
        // TODO(TJ): reuse plw-kw outside this for
        // try to unify
        for (int kw = 0; kw < plw; ++kw) {
          std::memcpy(dst_data + (plw - kw), src_data,
                      sizeof(T) * (output_width - (plw - kw)));
          dst_data = dst_data + col_matrix_width;
        }
        for (int kw = plw; kw < filter_width - prw; ++kw) {
          std::memcpy(dst_data, src_data + (kw - plw),
                      sizeof(T) * output_width);
          dst_data = dst_data + col_matrix_width;
        }
        int i = 1;
        for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
          std::memcpy(dst_data, src_data + (kw - plw),
                      sizeof(T) * (output_width - i));
          dst_data = dst_data + col_matrix_width;
        }
        src_data = src_data + im_width;
      }
    }
  }
}

}  // namespace math
}  // namespace operators
}  // namespace paddle
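A worked illustration (mine, not part of the patch) of why the stride-1, dilation-1, zero-padding path reduces to row-wise memcpy: in that configuration col[c][kh][kw][oh][ow] == im[c][oh + kh][ow + kw], so for fixed (c, kh, kw, oh) the ow axis is one contiguous slice of an image row.

#include <cstdio>

int main() {
  const int ih = 4, iw = 5, fh = 2, fw = 2;
  const int oh_n = ih - fh + 1, ow_n = iw - fw + 1;  // output is 3 x 4
  float im[ih][iw];
  for (int h = 0; h < ih; ++h)
    for (int w = 0; w < iw; ++w) im[h][w] = static_cast<float>(h * iw + w);
  // The slice copied for (oh, kh, kw) = (1, 1, 1): image row 2, offset 1,
  // length ow_n -- exactly what im2col_sh1sw1dh1dw1ph0pw0 memcpys at once.
  for (int w = 0; w < ow_n; ++w) printf("%g ", im[1 + 1][w + 1]);
  printf("\n");  // prints: 11 12 13 14
  return 0;
}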
paddle/fluid/operators/math/im2col_test.cc
...
@@ -14,7 +14,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/im2col.h"
 #include <gtest/gtest.h>
+#include <sys/time.h>
 #include <vector>
+#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"

 template <typename DeviceContext, typename Place>
 void testIm2col() {
...
@@ -160,82 +162,111 @@ void testIm2col() {
   delete context;
 }

-void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
-  paddle::framework::Tensor input;
-  paddle::framework::Tensor output;
-  paddle::framework::Tensor ref_output;
-  std::vector<int> padding({ph, pw});
-  std::vector<int> stride({1, 1});    // stride_y, stride_x
-  std::vector<int> dilation({1, 1});  // dilation_y, dilation_x
-  int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1;
-  int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1;
-  float* input_ptr =
-      input.mutable_data<float>({ic, ih, iw}, paddle::platform::CPUPlace());
-  for (int i = 0; i < input.numel(); ++i) {
-    input_ptr[i] = static_cast<float>(i + 1);
-  }
-  paddle::platform::CPUPlace place;
-  paddle::platform::CPUDeviceContext context(place);
-  output.mutable_data<float>({ic, fh, fw, output_height, output_width}, place);
-  ref_output.mutable_data<float>({ic, fh, fw, output_height, output_width},
-                                 place);
-  paddle::operators::math::Im2ColFunctor<
-      paddle::operators::math::ColFormat::kCFO,
-      paddle::platform::CPUDeviceContext, float>
-      im2col;
-  im2col(context, input, dilation, stride, padding, &output);
-  auto ref_im2col = [&](const paddle::framework::Tensor& im,
-                        const std::vector<int>& dilation,
-                        const std::vector<int>& stride,
-                        const std::vector<int>& padding,
-                        paddle::framework::Tensor* col) {
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[1];
-    int filter_width = col->dims()[2];
-    int output_height = col->dims()[3];
-    int output_width = col->dims()[4];
-    int channels_col = im_channels * filter_height * filter_width;
-    const float* im_data = im.data<float>();
-    float* col_data = col->data<float>();
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < output_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-        for (int w = 0; w < output_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-          int col_idx = (c * output_height + h) * output_width + w;
-          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
-          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                               im_col_idx < 0 || im_col_idx >= im_width)
-                                  ? 0.f
-                                  : im_data[im_idx];
-        }
-      }
-    }
-  };
-  ref_im2col(input, dilation, stride, padding, &ref_output);
-  float* out_cfo_ptr = output.data<float>();
-  float* out_ref_ptr = ref_output.data<float>();
-  for (int i = 0; i < output.numel(); ++i) {
-    EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]);
-  }
-}
-
 TEST(math, im2col) {
   testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
-  testIm2colCPU(/*ic*/ 3, /*ih*/ 5, /*iw*/ 5, /*fh*/ 3, /*fw*/ 2, /*ph*/ 0,
-                /*pw*/ 0);
-  testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1,
-                /*pw*/ 1);
 #ifdef PADDLE_WITH_CUDA
   testIm2col<paddle::platform::CUDADeviceContext,
              paddle::platform::CUDAPlace>();
 #endif
 }

+#define PREPARE_IM2COL_CPU                                                   \
+  paddle::platform::CPUPlace place;                                          \
+  paddle::platform::CPUDeviceContext context(place);                         \
+  paddle::framework::Tensor input;                                           \
+  paddle::framework::Tensor out;                                             \
+  paddle::framework::Tensor ref;                                             \
+  std::vector<int> padding({ph, pw});                                        \
+  std::vector<int> stride({1, 1});                                           \
+  std::vector<int> dilation({1, 1});                                         \
+  float* input_ptr = input.mutable_data<float>({ic, ih, iw}, place);         \
+  for (int i = 0; i < input.numel(); ++i) {                                  \
+    input_ptr[i] = static_cast<float>(i + 1);                                \
+  }                                                                          \
+  int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1;            \
+  int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1;             \
+  out.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
+  ref.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
+  paddle::operators::math::Im2ColFunctor<                                    \
+      paddle::operators::math::ColFormat::kCFO,                              \
+      paddle::platform::CPUDeviceContext, float>                             \
+      im2col
+
+void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
+  PREPARE_IM2COL_CPU;
+
+  im2col(context, input, dilation, stride, padding, &out);
+  paddle::operators::math::im2col_common<float>(input, dilation, stride,
+                                                padding, &ref);
+
+  float* ref_data = ref.data<float>();
+  float* out_data = out.data<float>();
+  for (int i = 0; i < out.numel(); ++i) {
+    EXPECT_EQ(out_data[i], ref_data[i]);
+  }
+}
+
+void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
+  PREPARE_IM2COL_CPU;
+  constexpr int repeat = 100;
+  auto GetCurrentMs = []() -> double {
+    struct timeval time;
+    gettimeofday(&time, NULL);
+    return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
+  };
+  auto t1 = GetCurrentMs();
+  for (int i = 0; i < repeat; ++i) {
+    im2col(context, input, dilation, stride, padding, &out);
+  }
+  auto t2 = GetCurrentMs();
+  for (int i = 0; i < repeat; ++i) {
+    paddle::operators::math::im2col_common<float>(input, dilation, stride,
+                                                  padding, &ref);
+  }
+  auto t3 = GetCurrentMs();
+  LOG(INFO) << "before: " << (t3 - t2) / repeat
+            << ",after: " << (t2 - t1) / repeat
+            << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%";
+}
+
+TEST(math, im2col_cputest) {
+  // padding_h == padding_w
+  for (int p = 0; p < 4; ++p) {
+    // width == height
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p,
+                  /*pw*/ p);
+    // height != width
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+    // filter == 1
+    testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+  }
+  // padding_h != padding_w
+  testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1,
+                /*pw*/ 2);
+  // benchmark
+  for (int p : {0, 1}) {
+    for (int k : {1, 3, 5}) {
+      LOG(INFO) << "padding == " << p << ", filter == " << k;
+      benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k,
+                  /*ph*/ p, /*pw*/ p);
+    }
+  }
+}
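The benchIm2col timer above leans on POSIX gettimeofday; where portability matters, a std::chrono equivalent of the GetCurrentMs lambda looks like this (a sketch of mine, not part of the patch):

#include <chrono>

// Drop-in replacement for the gettimeofday-based GetCurrentMs lambda above.
static double GetCurrentMs() {
  auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
  return std::chrono::duration<double, std::milli>(since_epoch).count();
}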
paddle/fluid/operators/reshape_op.cc
...
@@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor). The output tensor of reshape operator.");
     AddAttr<std::vector<int>>(
         "shape", "(std::vector<int>) Target shape of reshape operator.");
-    AddAttr<bool>("inplace",
-                  "(default: false) Change the source tensor's shape without "
-                  "memory copy. When Attr(inplace) is set true, the output "
-                  "tensor shares memory with Input(X), otherwise, a new output "
-                  "tensor is created, and its data are copied from Input(x).")
-        .SetDefault(false);
     AddComment(R"DOC(
 Reshape Operator.
...
@@ -233,16 +227,9 @@ class ReshapeKernel {
                       "sequence_reshape op.");
     }

-    bool inplace = ctx.Attr<bool>("inplace");
-    out->Resize(out_dims);
-    if (!inplace) {
-      out->mutable_data(ctx.GetPlace(), in->type());
-      framework::TensorCopySync(*in, ctx.GetPlace(), out);
-      out->Resize(out_dims);
-    } else {
-      out->ShareDataWith(*in);
-      out->Resize(out_dims);
-    }
+    out->mutable_data(ctx.GetPlace(), in->type());
+    framework::TensorCopySync(*in, ctx.GetPlace(), out);
+    out->Resize(out_dims);
   }
 };
...
@@ -251,19 +238,11 @@ class ReshapeGradKernel {
   void operator()(const framework::ExecutionContext &ctx) const {
     auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto in_dims = d_x->dims();

     d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    bool inplace = ctx.Attr<bool>("inplace");
-
-    auto in_dims = d_x->dims();
-    if (!inplace) {
-      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-      ctx.device_context().Wait();
-      d_x->Resize(in_dims);
-    } else {
-      d_x->ShareDataWith(*d_out);
-      d_x->Resize(in_dims);
-    }
+    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
+    d_x->Resize(in_dims);
   }
 };
...
paddle/fluid/operators/softmax_op.cc
...
@@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
                    ctx->GetInputDim(framework::GradVarName("Out")),
                    "Input(Out) and its gradients should have a same shape.");

-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->SetOutputDim(framework::GradVarName("X"),
+                      ctx->GetInputDim(framework::GradVarName("Out")));
   }

 protected:
...
@@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
       layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    auto input_data_type = framework::ToDataType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
     if (input_data_type == framework::proto::VarType::FP16) {
       PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                      "float16 can only be used on GPU place");
...
@@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
   }
 };

+class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("softmax_grad");
+
+    op->SetInput("Out", Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;

 REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::SoftmaxOpGradMaker);
 REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(
     softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
...
paddle/fluid/operators/split_ids_op.h
...
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once

+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
...
@@ -67,10 +68,15 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       const auto& ids_rows = ids_selected_rows->rows();
       auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
       const size_t shard_num = outs.size();
+      for (auto& out : outs) {
+        out->mutable_rows()->clear();
+      }
       // get rows for outputs
-      for (auto& id : ids_rows) {
-        size_t shard_id = static_cast<size_t>(id) % shard_num;
-        outs[shard_id]->mutable_rows()->push_back(id);
+      std::unordered_map<int64_t, size_t> id_to_index;
+      for (size_t i = 0; i < ids_rows.size(); ++i) {
+        id_to_index[ids_rows[i]] = i;
+        size_t shard_id = static_cast<size_t>(ids_rows[i]) % shard_num;
+        outs[shard_id]->mutable_rows()->push_back(ids_rows[i]);
       }

       int64_t row_width = ids_dims[1];
...
@@ -80,7 +86,8 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
             {static_cast<int64_t>(out->rows().size()), row_width});
         T* output = out->mutable_value()->mutable_data<T>(ddim, place);
         for (int64_t i = 0; i < ddim[0]; ++i) {
-          memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
+          memcpy(output + i * row_width,
+                 ids + id_to_index[out->rows()[i]] * row_width,
                  row_width * sizeof(T));
         }
       }
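A standalone sketch (mine, not code from the patch) of what the id_to_index change buys: an output shard stores the original id values, so each value must be mapped back to its position in the input before its row of data can be copied, rather than being used as a row offset directly.

#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<int64_t> ids_rows = {7, 2, 9};  // input ids, in input order
  std::unordered_map<int64_t, size_t> id_to_index;
  for (size_t i = 0; i < ids_rows.size(); ++i) id_to_index[ids_rows[i]] = i;

  std::vector<int64_t> shard_rows = {9, 7};  // ids routed to one shard
  for (int64_t id : shard_rows)
    // the data row lives at the id's *input* position, not at the id value
    printf("id %lld comes from input row %zu\n", static_cast<long long>(id),
           id_to_index[id]);
  return 0;
}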
...
paddle/fluid/platform/cuda_helper_test.cu
...
@@ -13,7 +13,6 @@
 // limitations under the License.

 #include <gtest/gtest.h>
-#include <bitset>
 #include <iostream>
 #include <random>
...
@@ -25,13 +24,13 @@
 using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 using paddle::platform::float16;

-#define CUDA_ATOMIC_KERNEL(op, T)                                        \
-  __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) {  \
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;        \
-         i += blockDim.x * gridDim.x) {                                 \
-      paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]);          \
-    }                                                                   \
-  }
+template <typename T>
+__global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
+  }
+}

 template <typename T>
 struct AddFunctor {
...
@@ -39,80 +38,116 @@ struct AddFunctor {
 };

 template <typename T>
-struct SubFunctor {
-  T operator()(const T& a, const T& b) { return a - b; }
-};
-
-// NOTE(dzhwinter): the float16 add has small underflow/overflow
-// so we use EXPECT_NEAR to check the result.
-#define ARITHMETIC_KERNEL_LAUNCH(op, T)                                 \
-  void Test##T##op(size_t num) {                                        \
-    T *in1, *in2, *out;                                                 \
-    T *d_in1, *d_in2;                                                   \
-    size_t size = sizeof(T) * num;                                      \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);                 \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);                 \
-    in1 = reinterpret_cast<T*>(malloc(size));                           \
-    in2 = reinterpret_cast<T*>(malloc(size));                           \
-    out = reinterpret_cast<T*>(malloc(size));                           \
-    std::minstd_rand engine;                                            \
-    std::uniform_real_distribution<double> dist(0.0, 1.0);              \
-    for (size_t i = 0; i < num; ++i) {                                  \
-      in1[i] = static_cast<T>(dist(engine));                            \
-      in2[i] = static_cast<T>(dist(engine));                            \
-    }                                                                   \
-    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);               \
-    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);               \
-    op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);      \
-    cudaDeviceSynchronize();                                            \
-    cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);               \
-    cudaDeviceSynchronize();                                            \
-    for (size_t i = 0; i < num; ++i) {                                  \
-      EXPECT_NEAR(static_cast<float>(out[i]),                           \
-                  static_cast<float>(op##Functor<T>()(in1[i], in2[i])), \
-                  0.001);                                               \
-    }                                                                   \
-    free(in1);                                                          \
-    free(in2);                                                          \
-    free(out);                                                          \
-    cudaFree(d_in1);                                                    \
-    cudaFree(d_in2);                                                    \
-  }
-CUDA_ATOMIC_KERNEL(Add, float);
-CUDA_ATOMIC_KERNEL(Add, double);
-CUDA_ATOMIC_KERNEL(Add, float16);
-
-namespace paddle {
-namespace platform {
-USE_CUDA_ATOMIC(Sub, int);
-};
-};
-CUDA_ATOMIC_KERNEL(Sub, int);
-ARITHMETIC_KERNEL_LAUNCH(Add, float);
-ARITHMETIC_KERNEL_LAUNCH(Add, double);
-ARITHMETIC_KERNEL_LAUNCH(Add, float16);
-ARITHMETIC_KERNEL_LAUNCH(Sub, int);
+void TestCase(size_t num) {
+  T *in1, *in2, *out;
+  T *d_in1, *d_in2;
+  size_t size = sizeof(T) * num;
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<T*>(malloc(size));
+  in2 = reinterpret_cast<T*>(malloc(size));
+  out = reinterpret_cast<T*>(malloc(size));
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num; ++i) {
+    in1[i] = static_cast<T>(dist(engine));
+    in2[i] = static_cast<T>(dist(engine));
+  }
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
+  AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<T>()(in1[i], in2[i])), 0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}

 // cuda primitives
 TEST(CudaAtomic, Add) {
-  TestfloatAdd(static_cast<size_t>(10));
-  TestfloatAdd(static_cast<size_t>(1024 * 1024));
-  TestdoubleAdd(static_cast<size_t>(10));
-  TestdoubleAdd(static_cast<size_t>(1024 * 1024));
-}
-
-TEST(CudaAtomic, Sub) {
-  TestintSub(static_cast<size_t>(10));
-  TestintSub(static_cast<size_t>(1024 * 1024));
+  TestCase<float>(static_cast<size_t>(10));
+  TestCase<float>(static_cast<size_t>(1024 * 1024));
+  TestCase<double>(static_cast<size_t>(10));
+  TestCase<double>(static_cast<size_t>(1024 * 1024));
 }

 TEST(CudaAtomic, float16) {
-  using paddle::platform::float16;
-  Testfloat16Add(static_cast<size_t>(1));
-  Testfloat16Add(static_cast<size_t>(2));
-  Testfloat16Add(static_cast<size_t>(3));
-  Testfloat16Add(static_cast<size_t>(10));
-  Testfloat16Add(static_cast<size_t>(1024 * 1024));
+  TestCase<float16>(static_cast<size_t>(1));
+  TestCase<float16>(static_cast<size_t>(2));
+  TestCase<float16>(static_cast<size_t>(3));
+  TestCase<float16>(static_cast<size_t>(10));
+  TestCase<float16>(static_cast<size_t>(1024 * 1024));
+}
+
+// unalignment of uint8
+void TestUnalign(size_t num, const int shift_bit) {
+  PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2");
+  float16 *in1, *in2, *out;
+  float16 *d_in1, *d_in2;
+  size_t size = sizeof(uint8_t) * (num + shift_bit);
+  size_t array_size = sizeof(float16) * (num / 2);
+
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<float16*>(malloc(size));
+  in2 = reinterpret_cast<float16*>(malloc(size));
+  out = reinterpret_cast<float16*>(malloc(size));
+
+  // right shift 1, mimic the unalignment of address
+  float16* r_in1 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in1) + shift_bit);
+  float16* r_in2 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in2) + shift_bit);
+
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num / 2; ++i) {
+    r_in1[i] = static_cast<float16>(dist(engine));
+    r_in2[i] = static_cast<float16>(dist(engine));
+  }
+  cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
+  AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num / 2; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
+                0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+TEST(CudaAtomic, float16Unalign) {
+  // same with float16 testcase
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 2);
+
+  // shift the address.
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 1);
+
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
+}
paddle/fluid/platform/cuda_primitives.h
...
@@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 // convert the value into float and do the add arithmetic.
 // then store the result into a uint32.
-inline __device__ uint32_t add_to_low_half(uint32_t val, float x) {
+inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
   float16 low_half;
   // the float16 in lower 16bits
-  low_half.x = static_cast<uint16_t>(val & 0xffffu);
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
   low_half = static_cast<float16>(static_cast<float>(low_half) + x);
-  return (val & 0xffff0000u) | low_half.x;
+  return (val & 0xFFFF0000u) | low_half.x;
 }

-inline __device__ uint32_t add_to_high_half(uint32_t val, float x) {
+inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
   float16 high_half;
   // the float16 in higher 16bits
   high_half.x = static_cast<uint16_t>(val >> 16);
   high_half = static_cast<float16>(static_cast<float>(high_half) + x);
-  return (val & 0xffffu) | (static_cast<uint32_t>(high_half.x) << 16);
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }

 CUDA_ATOMIC_WRAPPER(Add, float16) {
   // concrete packed float16 value may exsits in lower or higher 16bits
   // of the 32bits address.
   uint32_t* address_as_ui =
       reinterpret_cast<uint32_t*>(reinterpret_cast<char*>(address) -
-                                  (reinterpret_cast<size_t>(address) & 2));
+                                  (reinterpret_cast<uintptr_t>(address) & 0x02));
   float val_f = static_cast<float>(val);
   uint32_t old = *address_as_ui;
   uint32_t sum;
   uint32_t newval;
   uint32_t assumed;
-  if (((size_t)address & 2) == 0) {
+  if (((uintptr_t)address & 0x02) == 0) {
     // the float16 value stay at lower 16 bits of the address.
     do {
       assumed = old;
       old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
     } while (old != assumed);
     float16 ret;
-    ret.x = old & 0xffffu;
+    ret.x = old & 0xFFFFu;
     return ret;
   } else {
     // the float16 value stay at higher 16 bits of the address.
...
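A host-side illustration (assumptions mine, not code from the patch) of the packing that CUDA_ATOMIC_WRAPPER(Add, float16) manipulates: a float16 occupies either the low or the high 16 bits of the 4-byte-aligned word containing it, selected by bit 1 of its address, which is exactly what the & 0x02 test checks.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t word = 0;
  uint16_t lo = 0x3C00;  // float16 bit pattern for 1.0
  uint16_t hi = 0x4000;  // float16 bit pattern for 2.0
  word = (word & 0xFFFF0000u) | lo;                    // low-half store
  word = (word & 0x0000FFFFu) | (uint32_t(hi) << 16);  // high-half store
  printf("low=0x%04X high=0x%04X\n", word & 0xFFFFu, word >> 16);
  return 0;
}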
paddle/scripts/paddle_build.sh
...
@@ -534,7 +534,7 @@ EOF
         make -j `nproc` inference_lib_dist
         cd ${PADDLE_ROOT}/build
         cp -r fluid_install_dir fluid
-        tar -cf fluid.tgz fluid
+        tar -czf fluid.tgz fluid
     fi
 }
...
python/paddle/batch.py
...
@@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False):
         if drop_last == False and len(b) != 0:
             yield b

+    # Batch size check
+    batch_size = int(batch_size)
+    if batch_size <= 0:
+        raise ValueError("batch_size should be a positive integeral value, "
+                         "but got batch_size={}".format(batch_size))
+
     return batch_reader
python/paddle/fluid/__init__.py
...
@@ -127,6 +127,7 @@ def __bootstrap__():
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
+        read_env_flags.append('listen_and_serv_profile_period')

     if core.is_compiled_with_cuda():
         read_env_flags += [
...
python/paddle/fluid/layers/detection.py
...
@@ -38,6 +38,7 @@ __all__ = [
 __auto__ = [
     'iou_similarity',
     'box_coder',
+    'polygon_box_transform',
 ]

 __all__ += __auto__
...
python/paddle/fluid/layers/nn.py
...
@@ -4474,15 +4474,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                 "except one unknown dimension.")

     helper = LayerHelper("reshape", **locals())
-    reshaped = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(
         type="reshape",
         inputs=inputs,
-        attrs={"shape": shape,
-               "inplace": inplace},
-        outputs={"Out": reshaped})
+        attrs={"shape": shape},
+        outputs={"Out": out})

-    return helper.append_activation(reshaped)
+    return helper.append_activation(out)


 def lod_reset(x, y=None, target_lod=None):
...
python/paddle/fluid/layers/ops.py
...
@@ -66,9 +66,7 @@ __all__ = [
     'scatter',
     'sum',
     'slice',
-    'polygon_box_transform',
     'shape',
-    'iou_similarity',
     'maxout',
 ] + __activations__
...
浏览文件 @
a58dd3e5
...
@@ -122,7 +122,7 @@ class ParallelExecutor(object):
...
@@ -122,7 +122,7 @@ class ParallelExecutor(object):
else
:
else
:
cpu_num
=
int
(
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
exec_strategy
.
num_threads
=
cpu_num
exec_strategy
.
num_threads
=
cpu_num
*
2
if
build_strategy
is
None
:
if
build_strategy
is
None
:
build_strategy
=
BuildStrategy
()
build_strategy
=
BuildStrategy
()
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
...
@@ -62,5 +62,6 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
python/paddle/fluid/tests/unittests/dist_transformer.py
0 → 100644
浏览文件 @
a58dd3e5
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
argparse
import
time
import
math
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
import
os
import
sys
import
transformer_model
import
paddle.dataset.wmt16
as
wmt16
# Fix seed for test
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
WMT16_RECORDIO_FILE
=
"/tmp/wmt16.recordio"
class
ModelHyperParams
(
object
):
# Dictionary size for source and target language. This model directly uses
# paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
# alreay been added, but the <pad> token is not added. Transformer requires
# sequences in a mini-batch are padded to have the same length. A <pad> token is
# added into the original dictionary in paddle.dateset.wmt16.
# size of source word dictionary.
src_vocab_size
=
10000
# index for <pad> token in source language.
src_pad_idx
=
src_vocab_size
# size of target word dictionay
trg_vocab_size
=
10000
# index for <pad> token in target language.
trg_pad_idx
=
trg_vocab_size
# position value corresponding to the <pad> token.
pos_pad_idx
=
0
# max length of sequences. It should plus 1 to include position
# padding token for position encoding.
max_length
=
50
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model
=
512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid
=
1024
# the dimension that keys are projected to for dot-product attention.
d_key
=
64
# the dimension that values are projected to for dot-product attention.
d_value
=
64
# number of head used in multi-head attention.
n_head
=
8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer
=
6
# dropout rate used by all dropout layers.
dropout
=
0.1
def
prepare_batch_input
(
insts
,
src_pad_idx
,
trg_pad_idx
,
n_head
):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias. Then, convert the numpy
data to tensors and return a dict mapping names to tensors.
"""
def
__pad_batch_data
(
insts
,
pad_idx
,
is_target
=
False
,
return_pos
=
True
,
return_attn_bias
=
True
,
return_max_len
=
True
):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias.
"""
return_list
=
[]
max_len
=
max
(
len
(
inst
)
for
inst
in
insts
)
inst_data
=
np
.
array
(
[
inst
+
[
pad_idx
]
*
(
max_len
-
len
(
inst
))
for
inst
in
insts
])
return_list
+=
[
inst_data
.
astype
(
"int64"
).
reshape
([
-
1
,
1
])]
if
return_pos
:
inst_pos
=
np
.
array
([[
pos_i
+
1
if
w_i
!=
pad_idx
else
0
for
pos_i
,
w_i
in
enumerate
(
inst
)
]
for
inst
in
inst_data
])
return_list
+=
[
inst_pos
.
astype
(
"int64"
).
reshape
([
-
1
,
1
])]
if
return_attn_bias
:
if
is_target
:
# This is used to avoid attention on paddings and subsequent
# words.
slf_attn_bias_data
=
np
.
ones
((
inst_data
.
shape
[
0
],
max_len
,
max_len
))
slf_attn_bias_data
=
np
.
triu
(
slf_attn_bias_data
,
1
).
reshape
(
[
-
1
,
1
,
max_len
,
max_len
])
slf_attn_bias_data
=
np
.
tile
(
slf_attn_bias_data
,
[
1
,
n_head
,
1
,
1
])
*
[
-
1e9
]
else
:
# This is used to avoid attention on paddings.
slf_attn_bias_data
=
np
.
array
([[
0
]
*
len
(
inst
)
+
[
-
1e9
]
*
(
max_len
-
len
(
inst
))
for
inst
in
insts
])
slf_attn_bias_data
=
np
.
tile
(
slf_attn_bias_data
.
reshape
([
-
1
,
1
,
1
,
max_len
]),
[
1
,
n_head
,
max_len
,
1
])
return_list
+=
[
slf_attn_bias_data
.
astype
(
"float32"
)]
if
return_max_len
:
return_list
+=
[
max_len
]
return
return_list
if
len
(
return_list
)
>
1
else
return_list
[
0
]
src_word
,
src_pos
,
src_slf_attn_bias
,
src_max_len
=
__pad_batch_data
(
[
inst
[
0
]
for
inst
in
insts
],
src_pad_idx
,
is_target
=
False
)
trg_word
,
trg_pos
,
trg_slf_attn_bias
,
trg_max_len
=
__pad_batch_data
(
[
inst
[
1
]
for
inst
in
insts
],
trg_pad_idx
,
is_target
=
True
)
trg_src_attn_bias
=
np
.
tile
(
src_slf_attn_bias
[:,
:,
::
src_max_len
,
:],
[
1
,
1
,
trg_max_len
,
1
]).
astype
(
"float32"
)
lbl_word
=
__pad_batch_data
([
inst
[
2
]
for
inst
in
insts
],
trg_pad_idx
,
False
,
False
,
False
,
False
)
lbl_weight
=
(
lbl_word
!=
trg_pad_idx
).
astype
(
"float32"
).
reshape
([
-
1
,
1
])
return
[
src_word
,
src_pos
,
trg_word
,
trg_pos
,
src_slf_attn_bias
,
trg_slf_attn_bias
,
trg_src_attn_bias
,
lbl_word
,
lbl_weight
]

def transformer(use_feed):
    assert not use_feed, "transformer doesn't support feed yet"
    return transformer_model.transformer(
        ModelHyperParams.src_vocab_size + 1,
        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer, ModelHyperParams.n_head,
        ModelHyperParams.d_key, ModelHyperParams.d_value,
        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)


def get_model():
    avg_cost = transformer(use_feed=False)
    optimizer = fluid.optimizer.Adam()
    optimizer.minimize(avg_cost)
    return avg_cost


def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id=trainer_id,
        program=main_program,
        pservers=pserver_endpoints,
        trainers=trainers)
    return t


class DistTransformer2x2(object):
    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
                    trainer_id):
        get_model()
        t = get_transpiler(trainer_id,
                           fluid.default_main_program(), pserver_endpoints,
                           trainers)
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
        exe.run(pserver_prog)

    def _wait_ps_ready(self, pid):
        retry_times = 20
        while True:
            assert retry_times >= 0, "wait ps ready failed"
            time.sleep(3)
            print("waiting ps ready: ", pid)
            try:
                # the listen_and_serv_op would touch a file which contains the listen port
                # on the /tmp directory until it was ready to process all the RPC call.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except os.error:
                retry_times -= 1

    def run_trainer(self, place, endpoints, trainer_id, trainers,
                    is_dist=True):
        avg_cost = get_model()
        if is_dist:
            t = get_transpiler(trainer_id,
                               fluid.default_main_program(), endpoints,
                               trainers)
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
        strategy.allow_op_delay = False
        exe = fluid.ParallelExecutor(
            True, loss_name=avg_cost.name, exec_strategy=strategy)

        first_loss, = exe.run(fetch_list=[avg_cost.name])
        print(first_loss)
        for i in xrange(5):
            _ = exe.run(fetch_list=[avg_cost.name])
        last_loss, = exe.run(fetch_list=[avg_cost.name])
        print(last_loss)


def main(role="pserver",
         endpoints="127.0.0.1:9123",
         trainer_id=0,
         current_endpoint="127.0.0.1:9123",
         trainers=1,
         is_dist=True):
    reader = paddle.batch(
        wmt16.train(ModelHyperParams.src_vocab_size,
                    ModelHyperParams.trg_vocab_size),
        batch_size=transformer_model.batch_size)

    with fluid.recordio_writer.create_recordio_writer(
            WMT16_RECORDIO_FILE) as writer:
        for batch in reader():
            for tensor in prepare_batch_input(
                    batch, ModelHyperParams.src_pad_idx,
                    ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
                t = fluid.LoDTensor()
                t.set(tensor, fluid.CPUPlace())
                writer.append_tensor(t)
            writer.complete_append_tensor()

    model = DistTransformer2x2()
    if role == "pserver":
        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
    else:
        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
        ) else fluid.CPUPlace()
        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)


if __name__ == "__main__":
    if len(sys.argv) != 7:
        print(
            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
        )
    role = sys.argv[1]
    endpoints = sys.argv[2]
    trainer_id = int(sys.argv[3])
    current_endpoint = sys.argv[4]
    trainers = int(sys.argv[5])
    is_dist = True if sys.argv[6] == "TRUE" else False
    main(
        role=role,
        endpoints=endpoints,
        trainer_id=trainer_id,
        current_endpoint=current_endpoint,
        trainers=trainers,
        is_dist=is_dist)
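For reference, a minimal sketch of how this script is driven locally: the six positional arguments match the usage string above, and the endpoint values below are the ones test_dist_base.py passes for the single-trainer baseline run.

    # shell form: python dist_transformer.py trainer 127.0.0.1:1234 0 127.0.0.1:1234 1 FLASE
    main(
        role="trainer",
        endpoints="127.0.0.1:1234",
        trainer_id=0,
        current_endpoint="127.0.0.1:1234",
        trainers=1,
        is_dist=False)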
python/paddle/fluid/tests/unittests/op_test.py
View file @ a58dd3e5
@@ -68,6 +68,10 @@ def get_numeric_gradient(place,
         tensor_to_check_dtype = np.float32
     elif tensor_to_check_dtype == core.VarDesc.VarType.FP64:
         tensor_to_check_dtype = np.float64
+    elif tensor_to_check_dtype == core.VarDesc.VarType.FP16:
+        tensor_to_check_dtype = np.float16
+        # set delta as np.float16, will automatic convert to float32, float64
+        delta = np.array(delta).astype(np.float16)
     else:
         raise ValueError("Not supported data type " + str(
             tensor_to_check_dtype))
@@ -75,13 +79,24 @@ def get_numeric_gradient(place,
     gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)

     def __get_elem__(tensor, i):
-        if tensor_to_check_dtype == np.float32:
+        if tensor_to_check_dtype == np.float16:
+            numpy_tensor = np.array(tensor).astype(np.float16)
+            numpy_tensor = numpy_tensor.flatten()
+            return numpy_tensor[i]
+        elif tensor_to_check_dtype == np.float32:
             return tensor._get_float_element(i)
         else:
             return tensor._get_double_element(i)

     def __set_elem__(tensor, i, e):
-        if tensor_to_check_dtype == np.float32:
+        if tensor_to_check_dtype == np.float16:
+            numpy_tensor = np.array(tensor).astype(np.float16)
+            shape = numpy_tensor.shape
+            numpy_tensor = numpy_tensor.flatten()
+            numpy_tensor[i] = e
+            numpy_tensor = numpy_tensor.reshape(shape).view(np.uint16)
+            tensor.set(numpy_tensor, place)
+        elif tensor_to_check_dtype == np.float32:
             tensor._set_float_element(i, e)
         else:
             tensor._set_double_element(i, e)
@@ -135,6 +150,11 @@ class OpTest(unittest.TestCase):
         if not self.call_once:
             self.call_once = True
             self.dtype = data_type
+            # See the comment of np_dtype_to_fluid_dtype
+            # If the input type is uint16, we assume use float16
+            # for lodtensor dtype.
+            if self.dtype == np.uint16:
+                self.dtype == np.float16

     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def infer_dtype(numpy_dict):
@@ -163,19 +183,25 @@ class OpTest(unittest.TestCase):
             for name, np_value in self.inputs[var_name]:
                 tensor = core.LoDTensor()
                 if isinstance(np_value, tuple):
-                    tensor.set(np_value[0], place)
+                    tensor.set(
+                        OpTest.np_value_to_fluid_value(np_value[0]), place)
                     tensor.set_recursive_sequence_lengths(np_value[1])
                 else:
-                    tensor.set(np_value, place)
+                    tensor.set(OpTest.np_value_to_fluid_value(np_value), place)
                 feed_map[name] = tensor
         else:
             tensor = core.LoDTensor()
             if isinstance(self.inputs[var_name], tuple):
-                tensor.set(self.inputs[var_name][0], place)
+                tensor.set(
+                    OpTest.np_value_to_fluid_value(self.inputs[var_name][0]),
+                    place)
                 tensor.set_recursive_sequence_lengths(self.inputs[var_name][
                     1])
             else:
-                tensor.set(self.inputs[var_name], place)
+                tensor.set(
+                    OpTest.np_value_to_fluid_value(self.inputs[var_name]),
+                    place)
             feed_map[var_name] = tensor

     return feed_map
@@ -309,13 +335,22 @@ class OpTest(unittest.TestCase):
                 np.allclose(
                     actual_t, expect_t, atol=atol),
                 "Output (" + out_name + ") has diff at " + str(place) +
-                str(actual_t) + "\n" + str(expect_t))
+                "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t))
             if isinstance(expect, tuple):
                 self.assertListEqual(actual.recursive_sequence_lengths(),
                                      expect[1], "Output (" + out_name +
                                      ") has different lod at " + str(place))

     def _get_places(self):
+        if self.dtype == np.float16:
+            if core.is_compiled_with_cuda() and core.op_support_gpu(
+                    self.op_type):
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    return [place]
+                else:
+                    return []
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
@@ -346,9 +381,9 @@ class OpTest(unittest.TestCase):
         def err_msg():
             offset = np.argmax(diff_mat > max_relative_error)
             return ("%s Variable %s max gradient diff %f over limit %f, "
-                    "the first error element is %d, %f, %f") % (
-                        msg_prefix, name, max_diff, max_relative_error,
-                        offset, a.flatten()[offset], b.flatten()[offset])
+                    "the first error element is %d, expected %f, but got %f"
+                    ) % (msg_prefix, name, max_diff, max_relative_error,
+                         offset, a.flatten()[offset], b.flatten()[offset])

         self.assertLessEqual(max_diff, max_relative_error, err_msg())
@@ -437,6 +472,21 @@ class OpTest(unittest.TestCase):
             input.dtype = np.uint16
         return input

+    @staticmethod
+    def fluid_dtype_to_np_dtype(self, dtype):
+        """
+        See above, convert the dtype to normal type.
+        """
+        if dtype == np.uint16:
+            dtype = np.float16
+        return dtype
+
+    @staticmethod
+    def np_value_to_fluid_value(input):
+        if input.dtype == np.float16:
+            input = input.view(np.uint16)
+        return input
+
     def _get_gradient(self,
                       input_to_check,
                       place,
@@ -459,7 +509,7 @@ class OpTest(unittest.TestCase):
             if isinstance(place, fluid.CUDAPlace(0)):
                 use_cuda = True
             executor = fluid.ParallelExecutor(
-                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+                use_cuda=use_cuda, loss_name=loss.name, main_program=prog)
         else:
             executor = Executor(place)
         return list(
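The FP16 branches above plug into get_numeric_gradient, which estimates gradients with finite differences. A minimal standalone sketch of the central-difference idea (plain numpy, not the OpTest implementation):

    import numpy as np

    def numeric_grad(f, x, i, delta=1e-3):
        # perturb element i by +/- delta and take the centered difference
        x = x.astype(np.float64).copy()
        x.flat[i] += delta
        y_pos = f(x)
        x.flat[i] -= 2 * delta
        y_neg = f(x)
        return (y_pos - y_neg) / (2 * delta)

    # d/dx0 of sum(x**2) at x = [1, 2] is 2
    g = numeric_grad(lambda v: (v ** 2).sum(), np.array([1.0, 2.0]), 0)
    assert abs(g - 2.0) < 1e-6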
python/paddle/fluid/tests/unittests/test_dist_base.py
0 → 100644
View file @ a58dd3e5
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import unittest
import os
import sys
import signal
import subprocess


class TestDistBase(unittest.TestCase):
    def setUp(self):
        self._trainers = 2
        self._pservers = 2
        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
        self._python_interp = "python"

    def start_pserver(self, model_file):
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
             self._trainers)
        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
             self._trainers)

        ps0_proc = subprocess.Popen(
            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        ps1_proc = subprocess.Popen(
            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return ps0_proc, ps1_proc

    def _wait_ps_ready(self, pid):
        retry_times = 50
        while True:
            assert retry_times >= 0, "wait ps ready failed"
            time.sleep(3)
            try:
                # the listen_and_serv_op would touch a file which contains the listen port
                # on the /tmp directory until it was ready to process all the RPC call.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except os.error as e:
                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
                                 (e, retry_times))
                retry_times -= 1

    def check_with_place(self, model_file, delta=1e-3):
        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
        required_envs = {
            "PATH": os.getenv("PATH"),
            "PYTHONPATH": os.getenv("PYTHONPATH"),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15"
        }
        # Run local to get a base line
        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
        env_local.update(required_envs)
        local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
            (self._python_interp, model_file,
             "127.0.0.1:1234", "127.0.0.1:1234", 1)
        local_proc = subprocess.Popen(
            local_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env_local)
        local_proc.wait()
        out, err = local_proc.communicate()
        local_ret = out
        sys.stderr.write('local_loss: %s\n' % local_ret)
        sys.stderr.write('local_stderr: %s\n' % err)

        # Run dist train to compare with local results
        ps0, ps1 = self.start_pserver(model_file)
        self._wait_ps_ready(ps0.pid)
        self._wait_ps_ready(ps1.pid)

        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
             self._trainers)
        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
             self._trainers)

        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
        env0.update(required_envs)
        env1.update(required_envs)
        FNULL = open(os.devnull, 'w')

        tr0_proc = subprocess.Popen(
            tr0_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env0)
        tr1_proc = subprocess.Popen(
            tr1_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env1)

        tr0_proc.wait()
        tr1_proc.wait()
        out, err = tr0_proc.communicate()
        sys.stderr.write('dist_stderr: %s\n' % err)
        loss_data0 = out
        sys.stderr.write('dist_loss: %s\n' % loss_data0)
        lines = loss_data0.split("\n")
        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
        dist_last_loss = eval(lines[1].replace(" ", ","))[0]

        local_lines = local_ret.split("\n")
        local_first_loss = eval(local_lines[0])[0]
        local_last_loss = eval(local_lines[1])[0]

        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)

        # check tr0_out
        # FIXME: ensure the server process is killed
        # replace with ps0.terminate()
        os.kill(ps0.pid, signal.SIGKILL)
        os.kill(ps1.pid, signal.SIGKILL)
        FNULL.close()
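Concrete distributed tests subclass TestDistBase and point check_with_place at a model script, as the SE-ResNeXt and transformer tests below do. A minimal sketch of the pattern (dist_mnist.py is a hypothetical script name):

    from test_dist_base import TestDistBase

    class TestDistMnist2x2(TestDistBase):
        def test_mnist(self):
            # compares local vs. distributed first/last losses within delta
            self.check_with_place("dist_mnist.py", delta=1e-3)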
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
View file @ a58dd3e5
@@ -11,127 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import numpy as np
-import argparse
-import time
-import math
 import unittest
-import os
-import sys
-import signal
-import subprocess
-
-
-class TestDistSeResneXt2x2(unittest.TestCase):
-    def setUp(self):
-        self._trainers = 2
-        self._pservers = 2
-        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
-        self._python_interp = "python"
-
-    def start_pserver(self):
-        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
-        ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
-
-        ps0_proc = subprocess.Popen(
-            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        ps1_proc = subprocess.Popen(
-            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        return ps0_proc, ps1_proc
-
-    def _wait_ps_ready(self, pid):
-        retry_times = 20
-        while True:
-            assert retry_times >= 0, "wait ps ready failed"
-            time.sleep(3)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                retry_times -= 1
-
-    def test_with_place(self):
-        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
-        required_envs = {
-            "PATH": os.getenv("PATH"),
-            "PYTHONPATH": os.getenv("PYTHONPATH"),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
-            "FLAGS_fraction_of_gpu_memory_to_use": "0.15"
-        }
-        # Run local to get a base line
-        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
-        env_local.update(required_envs)
-        local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \
-            (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
-        local_proc = subprocess.Popen(
-            local_cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env_local)
-        local_proc.wait()
-        out, err = local_proc.communicate()
-        local_ret = out
-        sys.stderr.write('local_loss: %s\n' % local_ret)
-        sys.stderr.write('local_stderr: %s\n' % err)
-
-        # Run dist train to compare with local results
-        ps0, ps1 = self.start_pserver()
-        self._wait_ps_ready(ps0.pid)
-        self._wait_ps_ready(ps1.pid)
-
-        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
-        tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
-
-        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
-        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
-        env0.update(required_envs)
-        env1.update(required_envs)
-        FNULL = open(os.devnull, 'w')
-
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env0)
-        tr1_proc = subprocess.Popen(
-            tr1_cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env1)
-
-        tr0_proc.wait()
-        tr1_proc.wait()
-        out, err = tr0_proc.communicate()
-        sys.stderr.write('dist_stderr: %s\n' % err)
-        loss_data0 = out
-        sys.stderr.write('dist_loss: %s\n' % loss_data0)
-        lines = loss_data0.split("\n")
-        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
-        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
-
-        local_lines = local_ret.split("\n")
-        local_first_loss = eval(local_lines[0])[0]
-        local_last_loss = eval(local_lines[1])[0]
-
-        self.assertAlmostEqual(local_first_loss, dist_first_loss)
-        self.assertAlmostEqual(local_last_loss, dist_last_loss)
-
-        # check tr0_out
-        # FIXME: ensure the server process is killed
-        # replace with ps0.terminate()
-        os.kill(ps0.pid, signal.SIGKILL)
-        os.kill(ps1.pid, signal.SIGKILL)
-        FNULL.close()
+from test_dist_base import TestDistBase
+
+
+class TestDistSeResneXt2x2(TestDistBase):
+    def test_se_resnext(self):
+        # TODO(paddle-dev): Is the delta too large?
+        self.check_with_place("dist_se_resnext.py", delta=0.2)


 if __name__ == "__main__":
python/paddle/fluid/tests/unittests/test_dist_transformer.py
0 → 100644
View file @ a58dd3e5
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
from test_dist_base import TestDistBase


class TestDistTransformer2x2(TestDistBase):
    def test_transformer(self):
        # TODO(paddle-dev): check if the delta is OK.
        # Usually start around ~8000 and converge to ~5000
        self.check_with_place("dist_transformer.py", delta=400)


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
View file @ a58dd3e5
@@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32")
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
View file @ a58dd3e5
@@ -17,6 +17,8 @@ import numpy as np
 import math
 from op_test import OpTest

+np.random.seed(100)
+

 def find_latest_set(num):
     return 1 + int(math.floor(math.log(num, 2)))
python/paddle/fluid/tests/unittests/test_layers.py
View file @ a58dd3e5
@@ -465,6 +465,15 @@ class TestBook(unittest.TestCase):
         self.assertIsNotNone(out)
         print(str(program))

+    def test_shape(self):
+        program = Program()
+        with program_guard(program):
+            input = layers.data(
+                name="input", shape=[3, 100, 100], dtype="float32")
+            out = layers.shape(input, name="shape")
+            self.assertIsNotNone(out)
+            print(str(program))
+

 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
View file @ a58dd3e5
@@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase):
         print(str(result_program))


+class TestMemoryTranspiler2(unittest.TestCase):
+    def setUp(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            fc = layers.fc(input=x, size=10, act=None)
+            reshape = layers.reshape(x=fc, shape=[-1, 2, 5])
+            fc = layers.reshape(x=reshape, shape=[-1, 5, 2])
+            y_predict = layers.fc(input=fc, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(cost)
+            opt = optimizer.SGD(learning_rate=0.001)
+            opt.minimize(avg_cost)
+        self.program = program
+
+    def test_inplace_ops(self):
+        print("before optimization")
+        print(str(self.program))
+        result_program = memory_optimize(self.program)
+        print("after optimization")
+        print(str(result_program))
+
+
 if __name__ == "__main__":
     unittest.main()
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
View file @ a58dd3e5
@@ -211,7 +211,8 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_batchnorm_fc_convergence(False)

     def test_batchnorm_fc_with_new_strategy(self):
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
+        # FIXME(zcd): close this test temporally.
+        # self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
         self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
View file @ a58dd3e5
@@ -21,7 +21,7 @@ import paddle
 import paddle.dataset.wmt16 as wmt16
 import os

-WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"


 class ModelHyperParams(object):
@@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase):
             writer.append_tensor(t)
         writer.complete_append_tensor()

+    @unittest.skip("transformer is buggy in multi gpu")
     def test_main(self):
         self.check_network_convergence(transformer, use_cuda=True)
-        self.check_network_convergence(transformer, use_cuda=False)
+        self.check_network_convergence(transformer, use_cuda=False, iter=5)


 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_reshape_op.py
View file @ a58dd3e5
@@ -25,7 +25,7 @@ class TestReshapeOp(OpTest):
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}

     def test_check_output(self):
@@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest):
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}

     def test_check_output(self):
@@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest):
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}

     def test_check_output(self):
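The DimInfer variants exercise reshape's handling of a -1 target dimension, which is inferred from the total element count; numpy's reshape follows the same rule, e.g.:

    import numpy as np

    x = np.random.random((10, 2, 6)).astype("float32")
    y = x.reshape([-1, 20])   # 120 elements / 20, so -1 is inferred as 6
    assert y.shape == (6, 20)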
python/paddle/fluid/tests/unittests/test_split_ids_op.py
View file @ a58dd3e5
@@ -15,6 +15,8 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator


 class TestSplitIdsOp(OpTest):
@@ -31,5 +33,55 @@ class TestSplitIdsOp(OpTest):
         self.check_output()


+class TestSpliteIds(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        return places
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        rows = [0, 5, 7, 4, 9]
+        height = 20
+        row_numel = 2
+
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(rows)
+        x.set_height(height)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            for j in range(row_numel):
+                np_array[i, j] = rows[i] + j
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+
+        outs_name = ["out%d" % i for i in xrange(3)]
+        outs = [
+            scope.var(var_name).get_selected_rows() for var_name in outs_name
+        ]
+
+        # expected output selected rows
+        expected_out_rows = [[0, 9], [7, 4], [5]]
+
+        op = Operator("split_ids", Ids="X", Out=outs_name)
+
+        for _ in range(3):
+            op.run(scope, place)
+
+            for i in range(len(outs)):
+                expected_rows = expected_out_rows[i]
+                self.assertEqual(outs[i].rows(), expected_rows)
+                for j in range(len(expected_rows)):
+                    row = expected_rows[j]
+                    self.assertAlmostEqual(
+                        float(row), np.array(outs[i].get_tensor())[j, 0])
+                    self.assertAlmostEqual(
+                        float(row + 1), np.array(outs[i].get_tensor())[j, 1])
+
+
 if __name__ == '__main__':
     unittest.main()
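The expected_out_rows fixture above is consistent with split_ids sharding each id to output id % len(outs) while preserving arrival order (the modulo rule is inferred from the fixture, not quoted from the op). A quick sketch:

    rows = [0, 5, 7, 4, 9]
    shards = [[], [], []]
    for r in rows:
        shards[r % 3].append(r)
    assert shards == [[0, 9], [7, 4], [5]]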
python/paddle/fluid/tests/unittests/testsuite.py
View file @ a58dd3e5
@@ -18,14 +18,6 @@ import paddle.fluid.core as core
 from paddle.fluid.op import Operator


-def as_lodtensor(np_array, lod, place):
-    tensor = core.LoDTensor()
-    tensor.set(np_value, place)
-    if lod is not None:
-        tensor.set_recursive_sequence_lengths(lod)
-    return tensor
-
-
 def create_op(scope, op_type, inputs, outputs, attrs):
     kwargs = dict()
@@ -69,6 +61,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):


 def set_input(scope, op, inputs, place):
+    def np_value_to_fluid_value(input):
+        if input.dtype == np.float16:
+            input = input.view(np.uint16)
+        return input
+
     def __set_input__(var_name, var):
         if isinstance(var, tuple) or isinstance(var, np.ndarray):
             tensor = scope.find_var(var_name).get_tensor()
@@ -76,7 +73,7 @@ def set_input(scope, op, inputs, place):
                 tensor.set_recursive_sequence_lengths(var[1])
                 var = var[0]
             tensor._set_dims(var.shape)
-            tensor.set(var, place)
+            tensor.set(np_value_to_fluid_value(var), place)
         elif isinstance(var, float):
             scope.find_var(var_name).set_float(var)
         elif isinstance(var, int):
@@ -104,6 +101,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
     if name not in np_list:
         assert var_proto.intermediate, "{} not found".format(name)
     else:
+        # inferece the dtype from numpy value.
         np_value = np_list[name]
         if isinstance(np_value, tuple):
             dtype = np_value[0].dtype
@@ -116,6 +114,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
     if is_input:
         shape = list(np_value.shape)
         lod_level = 0
+        # NOTE(dzhwinter): type hacking
+        # numpy float16 is binded to paddle::platform::float16
+        # in tensor_py.h via the help of uint16 datatype. Because
+        # the internal memory representation of float16 is
+        # actually uint16_t in paddle. So we use np.uint16 in numpy for
+        # raw memory, it can pass through the pybind. So in the testcase,
+        # we feed data use data.view(uint16), but the dtype is float16 in fact.
+        # The data.view(uint16) means do not cast the data type, but process data as the uint16
+        if dtype == np.uint16:
+            dtype = np.float16
     return block.create_var(
         dtype=dtype, shape=shape, lod_level=lod_level, name=name)
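The NOTE(dzhwinter) comment above is the crux of this commit's float16 handling. A small sketch of what .view(np.uint16) does, i.e. bit reinterpretation rather than value conversion:

    import numpy as np

    a = np.array([0.5, 1.5, -2.0], dtype=np.float16)
    raw = a.view(np.uint16)        # same 16-bit payload, relabeled as uint16
    back = raw.view(np.float16)    # relabel back: values are bit-identical
    assert raw.dtype == np.uint16 and (a == back).all()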
python/paddle/fluid/tests/unittests/transformer_model.py
View file @ a58dd3e5
@@ -404,7 +404,7 @@ def transformer(
         trg_pad_idx,
         pos_pad_idx, ):
     file_obj = fluid.layers.open_recordio_file(
-        filename='./wmt16.recordio',
+        filename='/tmp/wmt16.recordio',
         shapes=[
             [batch_size * max_length, 1],
             [batch_size * max_length, 1],
python/paddle/fluid/transpiler/distribute_transpiler.py
View file @ a58dd3e5
@@ -494,6 +494,7 @@ class DistributeTranspiler(object):
             pserver_index = self.pserver_endpoints.index(endpoint)
             table_opt_block = self._create_table_optimize_block(
                 pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
+            optimize_blocks.append(table_opt_block)
             prefetch_var_name_to_block_id = self._create_prefetch_block(
                 pserver_index, pserver_program, table_opt_block)
             checkpoint_block_id = self._create_checkpoint_save_block(
tools/manylinux1/Dockerfile.x64
View file @ a58dd3e5
@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
 ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
-RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz
+RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && \
     bash build_scripts/install_nccl2.sh && rm -r build_scripts