Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
s920243400
PaddleDetection
提交
9f1927bd
P
PaddleDetection
项目概览
s920243400
/
PaddleDetection
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleDetection
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
9f1927bd
编写于
4月 12, 2019
作者:
Z
Zhaolong Xing
提交者:
GitHub
4月 12, 2019
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #16814 from NHZlX/cherry_pick_anakin_cpu_1.4
Cherry pick from feature/anakin-engine2
上级
0f18fbfa
4b9fa423
变更
85
隐藏空白更改
内联
并排
Showing
85 changed file
with
2238 addition
and
874 deletion
+2238
-874
cmake/anakin_subgraph.cmake
cmake/anakin_subgraph.cmake
+2
-1
paddle/fluid/framework/ir/fc_fuse_pass.cc
paddle/fluid/framework/ir/fc_fuse_pass.cc
+20
-0
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+12
-13
paddle/fluid/framework/ir/graph_pattern_detector.h
paddle/fluid/framework/ir/graph_pattern_detector.h
+2
-1
paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
+19
-9
paddle/fluid/inference/anakin/convert/CMakeLists.txt
paddle/fluid/inference/anakin/convert/CMakeLists.txt
+7
-2
paddle/fluid/inference/anakin/convert/activation.cc
paddle/fluid/inference/anakin/convert/activation.cc
+47
-14
paddle/fluid/inference/anakin/convert/activation.h
paddle/fluid/inference/anakin/convert/activation.h
+9
-5
paddle/fluid/inference/anakin/convert/affine_channel.cc
paddle/fluid/inference/anakin/convert/affine_channel.cc
+73
-0
paddle/fluid/inference/anakin/convert/affine_channel.h
paddle/fluid/inference/anakin/convert/affine_channel.h
+40
-0
paddle/fluid/inference/anakin/convert/batch_norm.cc
paddle/fluid/inference/anakin/convert/batch_norm.cc
+53
-82
paddle/fluid/inference/anakin/convert/batch_norm.h
paddle/fluid/inference/anakin/convert/batch_norm.h
+2
-1
paddle/fluid/inference/anakin/convert/concat.cc
paddle/fluid/inference/anakin/convert/concat.cc
+25
-18
paddle/fluid/inference/anakin/convert/concat.h
paddle/fluid/inference/anakin/convert/concat.h
+2
-1
paddle/fluid/inference/anakin/convert/conv2d.cc
paddle/fluid/inference/anakin/convert/conv2d.cc
+73
-35
paddle/fluid/inference/anakin/convert/conv2d.h
paddle/fluid/inference/anakin/convert/conv2d.h
+2
-1
paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
+76
-58
paddle/fluid/inference/anakin/convert/conv2d_fusion.h
paddle/fluid/inference/anakin/convert/conv2d_fusion.h
+2
-1
paddle/fluid/inference/anakin/convert/density_prior_box.cc
paddle/fluid/inference/anakin/convert/density_prior_box.cc
+48
-23
paddle/fluid/inference/anakin/convert/density_prior_box.h
paddle/fluid/inference/anakin/convert/density_prior_box.h
+3
-1
paddle/fluid/inference/anakin/convert/detection_out.cc
paddle/fluid/inference/anakin/convert/detection_out.cc
+36
-22
paddle/fluid/inference/anakin/convert/detection_out.h
paddle/fluid/inference/anakin/convert/detection_out.h
+2
-1
paddle/fluid/inference/anakin/convert/dropout.cc
paddle/fluid/inference/anakin/convert/dropout.cc
+31
-25
paddle/fluid/inference/anakin/convert/dropout.h
paddle/fluid/inference/anakin/convert/dropout.h
+2
-1
paddle/fluid/inference/anakin/convert/elementwise.cc
paddle/fluid/inference/anakin/convert/elementwise.cc
+43
-29
paddle/fluid/inference/anakin/convert/elementwise.h
paddle/fluid/inference/anakin/convert/elementwise.h
+6
-2
paddle/fluid/inference/anakin/convert/fc.cc
paddle/fluid/inference/anakin/convert/fc.cc
+94
-60
paddle/fluid/inference/anakin/convert/fc.h
paddle/fluid/inference/anakin/convert/fc.h
+6
-3
paddle/fluid/inference/anakin/convert/flatten.cc
paddle/fluid/inference/anakin/convert/flatten.cc
+25
-11
paddle/fluid/inference/anakin/convert/flatten.h
paddle/fluid/inference/anakin/convert/flatten.h
+2
-1
paddle/fluid/inference/anakin/convert/helper.cc
paddle/fluid/inference/anakin/convert/helper.cc
+32
-0
paddle/fluid/inference/anakin/convert/helper.h
paddle/fluid/inference/anakin/convert/helper.h
+95
-0
paddle/fluid/inference/anakin/convert/im2sequence.cc
paddle/fluid/inference/anakin/convert/im2sequence.cc
+26
-17
paddle/fluid/inference/anakin/convert/im2sequence.h
paddle/fluid/inference/anakin/convert/im2sequence.h
+2
-1
paddle/fluid/inference/anakin/convert/op_converter.h
paddle/fluid/inference/anakin/convert/op_converter.h
+87
-34
paddle/fluid/inference/anakin/convert/pool2d.cc
paddle/fluid/inference/anakin/convert/pool2d.cc
+30
-19
paddle/fluid/inference/anakin/convert/pool2d.h
paddle/fluid/inference/anakin/convert/pool2d.h
+2
-1
paddle/fluid/inference/anakin/convert/relu.cc
paddle/fluid/inference/anakin/convert/relu.cc
+56
-12
paddle/fluid/inference/anakin/convert/relu.h
paddle/fluid/inference/anakin/convert/relu.h
+14
-1
paddle/fluid/inference/anakin/convert/reshape.cc
paddle/fluid/inference/anakin/convert/reshape.cc
+25
-11
paddle/fluid/inference/anakin/convert/reshape.h
paddle/fluid/inference/anakin/convert/reshape.h
+2
-1
paddle/fluid/inference/anakin/convert/roi_align.cc
paddle/fluid/inference/anakin/convert/roi_align.cc
+72
-0
paddle/fluid/inference/anakin/convert/roi_align.h
paddle/fluid/inference/anakin/convert/roi_align.h
+39
-0
paddle/fluid/inference/anakin/convert/scale.cc
paddle/fluid/inference/anakin/convert/scale.cc
+27
-14
paddle/fluid/inference/anakin/convert/scale.h
paddle/fluid/inference/anakin/convert/scale.h
+2
-1
paddle/fluid/inference/anakin/convert/softmax.cc
paddle/fluid/inference/anakin/convert/softmax.cc
+26
-12
paddle/fluid/inference/anakin/convert/softmax.h
paddle/fluid/inference/anakin/convert/softmax.h
+2
-1
paddle/fluid/inference/anakin/convert/split.cc
paddle/fluid/inference/anakin/convert/split.cc
+29
-16
paddle/fluid/inference/anakin/convert/split.h
paddle/fluid/inference/anakin/convert/split.h
+2
-1
paddle/fluid/inference/anakin/convert/sum.cc
paddle/fluid/inference/anakin/convert/sum.cc
+27
-14
paddle/fluid/inference/anakin/convert/sum.h
paddle/fluid/inference/anakin/convert/sum.h
+2
-1
paddle/fluid/inference/anakin/convert/test_activation_op.cc
paddle/fluid/inference/anakin/convert/test_activation_op.cc
+39
-6
paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc
.../fluid/inference/anakin/convert/test_affine_channel_op.cc
+75
-0
paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
+22
-2
paddle/fluid/inference/anakin/convert/test_concat_op.cc
paddle/fluid/inference/anakin/convert/test_concat_op.cc
+19
-22
paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
+22
-5
paddle/fluid/inference/anakin/convert/test_dropout_op.cc
paddle/fluid/inference/anakin/convert/test_dropout_op.cc
+21
-2
paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
+36
-5
paddle/fluid/inference/anakin/convert/test_fc_op.cc
paddle/fluid/inference/anakin/convert/test_fc_op.cc
+22
-5
paddle/fluid/inference/anakin/convert/test_flatten_op.cc
paddle/fluid/inference/anakin/convert/test_flatten_op.cc
+21
-5
paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
+50
-46
paddle/fluid/inference/anakin/convert/test_relu_op.cc
paddle/fluid/inference/anakin/convert/test_relu_op.cc
+30
-5
paddle/fluid/inference/anakin/convert/test_reshape_op.cc
paddle/fluid/inference/anakin/convert/test_reshape_op.cc
+38
-6
paddle/fluid/inference/anakin/convert/test_softmax_op.cc
paddle/fluid/inference/anakin/convert/test_softmax_op.cc
+22
-4
paddle/fluid/inference/anakin/convert/test_split_op.cc
paddle/fluid/inference/anakin/convert/test_split_op.cc
+43
-31
paddle/fluid/inference/anakin/convert/test_sum_op.cc
paddle/fluid/inference/anakin/convert/test_sum_op.cc
+21
-2
paddle/fluid/inference/anakin/convert/test_transpose_op.cc
paddle/fluid/inference/anakin/convert/test_transpose_op.cc
+37
-7
paddle/fluid/inference/anakin/convert/transpose.cc
paddle/fluid/inference/anakin/convert/transpose.cc
+21
-11
paddle/fluid/inference/anakin/convert/transpose.h
paddle/fluid/inference/anakin/convert/transpose.h
+2
-1
paddle/fluid/inference/anakin/convert/ut_helper.h
paddle/fluid/inference/anakin/convert/ut_helper.h
+34
-26
paddle/fluid/inference/anakin/engine.cc
paddle/fluid/inference/anakin/engine.cc
+75
-12
paddle/fluid/inference/anakin/engine.h
paddle/fluid/inference/anakin/engine.h
+35
-17
paddle/fluid/inference/anakin/op_teller.cc
paddle/fluid/inference/anakin/op_teller.cc
+2
-0
paddle/fluid/inference/anakin/test_anakin_engine.cc
paddle/fluid/inference/anakin/test_anakin_engine.cc
+3
-6
paddle/fluid/inference/analysis/argument.h
paddle/fluid/inference/analysis/argument.h
+20
-14
paddle/fluid/inference/analysis/ir_pass_manager.cc
paddle/fluid/inference/analysis/ir_pass_manager.cc
+6
-0
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
...luid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+74
-14
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
...fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
+8
-0
paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
+0
-2
paddle/fluid/inference/api/analysis_config.cc
paddle/fluid/inference/api/analysis_config.cc
+18
-5
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+6
-1
paddle/fluid/inference/api/paddle_analysis_config.h
paddle/fluid/inference/api/paddle_analysis_config.h
+6
-1
paddle/fluid/inference/api/paddle_pass_builder.cc
paddle/fluid/inference/api/paddle_pass_builder.cc
+6
-5
paddle/fluid/operators/anakin/anakin_engine_op.h
paddle/fluid/operators/anakin/anakin_engine_op.h
+32
-28
paddle/fluid/pybind/inference_api.cc
paddle/fluid/pybind/inference_api.cc
+9
-0
未找到文件。
cmake/anakin_subgraph.cmake
浏览文件 @
9f1927bd
...
...
@@ -25,8 +25,9 @@ endif()
if
(
ANAKIN_FOUND
)
message
(
STATUS
"Current ANAKIN header is
${
ANAKIN_INCLUDE_DIR
}
/anakin_config.h. "
)
include_directories
(
${
ANAKIN_ROOT
}
)
include_directories
(
${
ANAKIN_ROOT
}
/include
)
include_directories
(
${
ANAKIN_ROOT
}
/
include/
saber
)
include_directories
(
${
ANAKIN_ROOT
}
/saber
)
link_directories
(
${
ANAKIN_ROOT
}
)
add_definitions
(
-DPADDLE_WITH_ANAKIN
)
endif
()
paddle/fluid/framework/ir/fc_fuse_pass.cc
浏览文件 @
9f1927bd
...
...
@@ -48,17 +48,37 @@ void FCFusePass::ApplyImpl(ir::Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH
(
elementwise_add
,
elementwise_add
,
fc_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
mul_out
,
mul_out
,
fc_pattern
);
auto
base_op_desc
=
mul
->
Op
();
// Create an FC Node.
// OpDesc desc(base_op_desc, nullptr);
OpDesc
desc
;
std
::
string
fc_x_in
=
subgraph
.
at
(
x
)
->
Name
();
std
::
string
fc_Y_in
=
w
->
Name
();
std
::
string
fc_bias_in
=
fc_bias
->
Name
();
std
::
string
fc_out_out
=
fc_out
->
Name
();
desc
.
SetInput
(
"Input"
,
std
::
vector
<
std
::
string
>
({
fc_x_in
}));
desc
.
SetInput
(
"W"
,
std
::
vector
<
std
::
string
>
({
fc_Y_in
}));
desc
.
SetInput
(
"Bias"
,
std
::
vector
<
std
::
string
>
({
fc_bias_in
}));
desc
.
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
fc_out_out
}));
desc
.
SetAttr
(
"in_num_col_dims"
,
mul
->
Op
()
->
GetAttr
(
"x_num_col_dims"
));
// For anakin subgraph int8
// When in anakin subgraph int8 mode, the pattern like "fake_quant + mul +
// fake_dequant"
// can be detected by the quant_dequant_fuse_pass. This pass will add
// "input_scale",
// "weight_scale" which are extracted from fake_quant op and fake_dequant op
// to mul op,
// and then delete the fake_quant op and fake_dequant op in the graph. If
// the mul op
// has the scale info, we should add those to the fused fc.
if
(
base_op_desc
->
HasAttr
(
"enable_int8"
))
{
desc
.
SetAttr
(
"enable_int8"
,
base_op_desc
->
GetAttr
(
"enable_int8"
));
desc
.
SetAttr
(
"input_scale"
,
base_op_desc
->
GetAttr
(
"input_scale"
));
desc
.
SetAttr
(
"weight_scale"
,
base_op_desc
->
GetAttr
(
"weight_scale"
));
}
desc
.
SetType
(
"fc"
);
auto
fc_node
=
g
->
CreateOpNode
(
&
desc
);
// OpDesc will be copied.
GraphSafeRemoveNodes
(
graph
,
{
mul
,
elementwise_add
,
mul_out
});
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
9f1927bd
...
...
@@ -1640,7 +1640,8 @@ PDNode *patterns::FillConstantElementWiseMulFuse::operator()(
void
patterns
::
QuantDequantOpFuse
::
operator
()(
PDNode
*
quant_op_input
,
const
std
::
string
&
op_type
,
const
std
::
string
&
weight_name
,
int
times
)
{
int
times
,
const
std
::
string
&
quant_type
)
{
const
int
kNumFields
=
5
;
const
int
kQuantizedWeightOffset
=
0
;
const
int
kQuantizedOpOffset
=
1
;
...
...
@@ -1648,24 +1649,22 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
const
int
kDequantOpOffset
=
3
;
const
int
kDequantOpOutOffset
=
4
;
// the quant op always be one.
auto
quant_op_in_scale
=
pattern
->
NewNode
(
GetNodeName
(
"quant_op_in_scale"
))
->
assert_is_op_input
(
"fake_quantize_range_abs_max"
,
"InScale"
)
->
AsInput
();
auto
quant_op
=
pattern
->
NewNode
(
GetNodeName
(
"quant_op"
))
->
assert_is_op
(
"fake_quantize_range_abs_max"
);
auto
quant_op_in_scale
=
pattern
->
NewNode
(
GetNodeName
(
"quant_op_in_scale"
))
->
assert_is_op_input
(
quant_type
,
"InScale"
)
->
AsInput
();
auto
quant_op
=
pattern
->
NewNode
(
GetNodeName
(
"quant_op"
))
->
assert_is_op
(
quant_type
);
auto
quant_op_out_scale
=
pattern
->
NewNode
(
GetNodeName
(
"quant_op_out_scale"
))
->
assert_is_op_output
(
"fake_quantize_range_abs_max"
,
"OutScale"
)
->
assert_is_op_output
(
quant_type
,
"OutScale"
)
->
assert_is_op_input
(
"fake_dequantize_max_abs"
,
"Scale"
)
->
AsIntermediate
();
auto
quant_op_out
=
pattern
->
NewNode
(
GetNodeName
(
"quant_op_out"
))
->
assert_is_op_output
(
"fake_quantize_range_abs_max"
,
"Out"
)
->
assert_is_op_input
(
op_type
)
->
AsIntermediate
();
auto
quant_op_out
=
pattern
->
NewNode
(
GetNodeName
(
"quant_op_out"
))
->
assert_is_op_output
(
quant_type
,
"Out"
)
->
assert_is_op_input
(
op_type
)
->
AsIntermediate
();
// there are 'times' quantized and dequant op
std
::
vector
<
PDNode
*>
nodes
;
...
...
paddle/fluid/framework/ir/graph_pattern_detector.h
浏览文件 @
9f1927bd
...
...
@@ -880,7 +880,8 @@ struct QuantDequantOpFuse : public PatternBase {
:
PatternBase
(
pattern
,
name_scope
,
"quant_dequant_fuse"
)
{}
void
operator
()(
PDNode
*
quant_op_input
,
const
std
::
string
&
op_name
,
const
std
::
string
&
weight_name
,
int
times
=
1
);
const
std
::
string
&
weight_name
,
int
times
,
const
std
::
string
&
quant_type
);
std
::
string
GetNodeName
(
const
std
::
string
&
op_type
)
{
return
PDNodeName
(
name_scope_
,
repr_
,
id_
,
op_type
);
...
...
paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
浏览文件 @
9f1927bd
...
...
@@ -25,7 +25,8 @@ namespace framework {
namespace
ir
{
void
RunQuantDequant
(
ir
::
Graph
*
graph
,
Scope
*
scope
,
int
times
,
std
::
string
op_type
)
{
const
std
::
string
&
op_type
,
const
std
::
string
&
quant_type
)
{
const
std
::
string
pattern_name
=
"quant_dequant_fuse"
;
// FusePassBase::Init(pattern_name, graph);
const
int
kNumFields
=
5
;
...
...
@@ -38,7 +39,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
GraphPatternDetector
gpd
;
auto
*
x
=
gpd
.
mutable_pattern
()
->
NewNode
(
"x"
)
->
assert_is_op_input
(
"fake_quantize_range_abs_max"
,
"X"
)
->
assert_is_op_input
(
quant_type
,
"X"
)
->
AsInput
();
std
::
string
quantized_op_type
=
""
;
...
...
@@ -46,6 +47,9 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
if
(
op_type
==
"conv2d"
)
{
quantized_op_type
=
"conv2d"
;
weight_name
=
"Filter"
;
}
else
if
(
op_type
==
"depthwise_conv2d"
)
{
quantized_op_type
=
"depthwise_conv2d"
;
weight_name
=
"Filter"
;
}
else
if
(
op_type
==
"conv2d_fusion"
)
{
quantized_op_type
=
"conv2d_fusion"
;
weight_name
=
"Filter"
;
...
...
@@ -62,7 +66,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
}
patterns
::
QuantDequantOpFuse
pattern
(
gpd
.
mutable_pattern
(),
pattern_name
);
pattern
(
x
,
quantized_op_type
,
weight_name
,
times
);
pattern
(
x
,
quantized_op_type
,
weight_name
,
times
,
quant_type
);
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
...
...
@@ -103,7 +107,6 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
std
::
unordered_set
<
const
Node
*>
delete_nodes
;
for
(
int
i
=
0
;
i
<
times
;
i
++
)
{
// max_range = (range * range) / weight_scale
float
max_range
=
boost
::
get
<
float
>
(
nodes
[
i
*
kNumFields
+
kDequantOpOffset
]
->
Op
()
->
GetAttr
(
"max_range"
));
float
weight_scale
=
(
range
*
range
)
/
max_range
;
...
...
@@ -118,7 +121,8 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
new_op_desc
.
SetType
(
quantized_op_type
);
if
(
quantized_op_type
==
"conv2d"
||
quantized_op_type
==
"conv2d_fusion"
)
{
quantized_op_type
==
"conv2d_fusion"
||
quantized_op_type
==
"depthwise_conv2d"
)
{
new_op_desc
.
SetInput
(
"Input"
,
{
new_input
});
new_op_desc
.
SetOutput
(
"Output"
,
{
new_output
});
}
else
if
(
quantized_op_type
==
"fc"
)
{
...
...
@@ -156,11 +160,17 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
const
std
::
string
pattern_name
=
"quant_dequant_fuse"
;
FusePassBase
::
Init
(
pattern_name
,
graph
);
std
::
unordered_set
<
std
::
string
>
quantized_op_types
=
{
"conv2d"
,
"mul"
};
std
::
unordered_set
<
std
::
string
>
quant_types
=
{
"fake_quantize_range_abs_max"
,
"fake_quantize_moving_average_abs_max"
};
std
::
unordered_set
<
std
::
string
>
quantized_op_types
=
{
"conv2d"
,
"mul"
,
"depthwise_conv2d"
};
auto
*
scope
=
param_scope
();
for
(
auto
&
op_type
:
quantized_op_types
)
{
for
(
int
i
=
1
;
i
<=
6
;
i
++
)
{
RunQuantDequant
(
graph
,
scope
,
i
,
op_type
);
for
(
auto
&
quant_type
:
quant_types
)
{
for
(
auto
&
op_type
:
quantized_op_types
)
{
for
(
int
i
=
6
;
i
>=
1
;
i
--
)
{
RunQuantDequant
(
graph
,
scope
,
i
,
op_type
,
quant_type
);
}
}
}
}
...
...
paddle/fluid/inference/anakin/convert/CMakeLists.txt
浏览文件 @
9f1927bd
cc_library
(
anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry
)
cc_library
(
anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc
batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc
detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc
roi_align.cc helper.cc DEPS anakin_engine framework_proto scope op_registry
gtest
)
cc_test
(
test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL
)
cc_test
(
test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL
)
...
...
@@ -14,5 +19,5 @@ cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter fla
cc_test
(
test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL
)
cc_test
(
test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL
)
cc_test
(
test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL
)
#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col)
cc_test
(
test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL
)
cc_test
(
test_anakin_affine_channel SRCS test_affine_channel_op.cc DEPS anakin_op_converter affine_channel_op SERIAL
)
paddle/fluid/inference/anakin/convert/activation.cc
浏览文件 @
9f1927bd
...
...
@@ -16,16 +16,13 @@
#include <algorithm>
#include <map>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
ActivationOpConverter
::
ActivationOpConverter
(
const
std
::
string
&
op_type
)
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
ActivationOpConverter
<
TargetT
,
PrecisionT
>::
ActivationOpConverter
(
const
std
::
string
&
op_type
)
:
op_type_
(
op_type
)
{
auto
it
=
anakin_op_types_
.
find
(
op_type_
);
PADDLE_ENFORCE
(
it
!=
anakin_op_types_
.
end
(),
...
...
@@ -33,10 +30,10 @@ ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
anakin_op_type_
=
it
->
second
;
}
void
ActivationOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
ActivationOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
...
...
@@ -44,13 +41,49 @@ void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
auto
input_name
=
op_desc
.
Input
(
"X"
).
front
();
auto
output_name
=
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
AddOp
(
op_name
,
"Activation"
,
{
input_name
},
{
output_name
});
engine_
->
AddOpAttr
(
op_name
,
"type"
,
anakin_op_type_
);
this
->
engine_
->
AddOp
(
op_name
,
"Activation"
,
{
input_name
},
{
output_name
});
this
->
engine_
->
AddOpAttr
(
op_name
,
"type"
,
anakin_op_type_
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
sigmoid
,
SigmoidOpConverter
);
REGISTER_ANAKIN_OP_CONVERTER
(
tanh
,
TanhOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
sigmoid_nv_fp32
=
::
paddle
::
inference
::
anakin
::
SigmoidOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
sigmoid_nv_int8
=
::
paddle
::
inference
::
anakin
::
SigmoidOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
using
tanh_nv_fp32
=
::
paddle
::
inference
::
anakin
::
TanhOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
tanh_nv_int8
=
::
paddle
::
inference
::
anakin
::
TanhOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
sigmoid
,
sigmoid_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
sigmoid
,
sigmoid_nv_int8
);
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
tanh
,
tanh_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
tanh
,
tanh_nv_int8
);
#endif
using
sigmoid_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
SigmoidOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
sigmoid_cpu_int8
=
::
paddle
::
inference
::
anakin
::
SigmoidOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
using
tanh_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
TanhOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
tanh_cpu_int8
=
::
paddle
::
inference
::
anakin
::
TanhOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
sigmoid
,
sigmoid_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
sigmoid
,
sigmoid_cpu_int8
);
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
tanh
,
tanh_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
tanh
,
tanh_cpu_int8
);
paddle/fluid/inference/anakin/convert/activation.h
浏览文件 @
9f1927bd
...
...
@@ -22,7 +22,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
ActivationOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
ActivationOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
explicit
ActivationOpConverter
(
const
std
::
string
&
op_type
);
...
...
@@ -39,14 +40,17 @@ class ActivationOpConverter : public AnakinOpConverter {
{
"sigmoid"
,
"Sigmoid"
}};
};
class
TanhOpConverter
:
public
ActivationOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
TanhOpConverter
:
public
ActivationOpConverter
<
TargetT
,
PrecisionT
>
{
public:
TanhOpConverter
()
:
ActivationOpConverter
(
"tanh"
)
{}
TanhOpConverter
()
:
ActivationOpConverter
<
TargetT
,
PrecisionT
>
(
"tanh"
)
{}
};
class
SigmoidOpConverter
:
public
ActivationOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
SigmoidOpConverter
:
public
ActivationOpConverter
<
TargetT
,
PrecisionT
>
{
public:
SigmoidOpConverter
()
:
ActivationOpConverter
(
"sigmoid"
)
{}
SigmoidOpConverter
()
:
ActivationOpConverter
<
TargetT
,
PrecisionT
>
(
"sigmoid"
)
{}
};
}
// namespace anakin
}
// namespace inference
...
...
paddle/fluid/inference/anakin/convert/affine_channel.cc
0 → 100644
浏览文件 @
9f1927bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/affine_channel.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/inference/anakin/convert/helper.h"
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
AffineChannelOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
auto
input_name
=
op_desc
.
Input
(
"X"
).
front
();
auto
output_name
=
op_desc
.
Output
(
"Out"
).
front
();
this
->
engine_
->
AddOp
(
op_name
,
"AffineChannel"
,
{
input_name
},
{
output_name
});
// Copy the Scale to CPUPlace and get the pointer.
auto
*
scale_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Scale"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
scale_v
);
auto
weight1
=
pblock_from_var
<
TargetT
,
PrecisionT
>
(
*
scale_v
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
// Copy the Bias to CPUPlace and get the pointer.
auto
*
bias_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Bias"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
bias_v
);
auto
weight2
=
pblock_from_var
<
TargetT
,
PrecisionT
>
(
*
bias_v
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_2"
,
*
weight2
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
#ifdef PADDLE_WITH_CUDA
using
affine_channel_nv_fp32
=
::
paddle
::
inference
::
anakin
::
AffineChannelOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
affine_channel_nv_int8
=
::
paddle
::
inference
::
anakin
::
AffineChannelOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
affine_channel
,
affine_channel_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
affine_channel
,
affine_channel_nv_int8
);
#endif
using
affine_channel_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
AffineChannelOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
affine_channel_cpu_int8
=
::
paddle
::
inference
::
anakin
::
AffineChannelOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
affine_channel
,
affine_channel_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
affine_channel
,
affine_channel_cpu_int8
);
paddle/fluid/inference/anakin/convert/affine_channel.h
0 → 100644
浏览文件 @
9f1927bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
AffineChannelOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
AffineChannelOpConverter
()
=
default
;
virtual
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
;
virtual
~
AffineChannelOpConverter
()
{}
private:
};
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/anakin/convert/batch_norm.cc
浏览文件 @
9f1927bd
...
...
@@ -18,111 +18,82 @@
#include <map>
#include <string>
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
#include "paddle/fluid/inference/anakin/convert/helper.h"
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
BatchNormOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
BatchNormOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Y"
).
size
(),
1
);
std
::
map
<
std
::
string
,
std
::
string
>
inputs
;
for
(
auto
k
:
{
"X"
,
"Scale"
,
"Bias"
,
"Mean"
,
"Variance"
})
{
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
k
).
size
(),
1UL
);
auto
v
=
op_desc
.
Input
(
k
).
front
();
inputs
.
insert
({
k
,
v
});
}
auto
input
=
op_desc
.
Input
(
"X"
).
front
();
auto
output
=
op_desc
.
Output
(
"Y"
).
front
();
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Y"
).
front
();
auto
epsilon
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"epsilon"
));
// auto momentum = boost::get<float>(op_desc.GetAttr("momentum"));
auto
bn_op_name
=
op_name
+
":bn"
;
auto
bn_output
=
bn_op_name
+
"_output"
;
engine_
->
AddOp
(
bn_op_name
,
"BatchNorm"
,
{
inputs
[
"X"
]
},
{
bn_output
});
engine_
->
AddOpAttr
(
bn_op_name
,
"epsilon"
,
epsilon
);
engine_
->
AddOpAttr
(
bn_op_name
,
"momentum"
,
static_cast
<
float
>
(
1.0
));
this
->
engine_
->
AddOp
(
bn_op_name
,
"BatchNorm"
,
{
input
},
{
bn_output
});
this
->
engine_
->
AddOpAttr
(
bn_op_name
,
"epsilon"
,
epsilon
);
this
->
engine_
->
AddOpAttr
(
bn_op_name
,
"momentum"
,
static_cast
<
float
>
(
1.0
));
auto
scale_op_name
=
op_name
+
":scale"
;
auto
get_lod_tensor
=
[
this
,
&
scope
,
&
op_name
](
const
std
::
string
&
var_name
,
framework
::
LoDTensor
*
tensor
)
{
auto
*
v
=
scope
.
FindVar
(
var_name
);
PADDLE_ENFORCE_NOT_NULL
(
v
);
auto
*
t
=
v
->
GetMutable
<
framework
::
LoDTensor
>
();
tensor
->
Resize
(
t
->
dims
());
TensorCopySync
(
*
t
,
platform
::
CPUPlace
(),
tensor
);
};
framework
::
LoDTensor
bias_t
;
framework
::
LoDTensor
mean_t
;
framework
::
LoDTensor
scale_t
;
framework
::
LoDTensor
variance_t
;
get_lod_tensor
(
inputs
[
"Bias"
],
&
bias_t
);
get_lod_tensor
(
inputs
[
"Mean"
],
&
mean_t
);
get_lod_tensor
(
inputs
[
"Scale"
],
&
scale_t
);
get_lod_tensor
(
inputs
[
"Variance"
],
&
variance_t
);
auto
fill_shape
=
[](
size_t
n
,
std
::
vector
<
int
>
shape
)
{
shape
.
insert
(
shape
.
begin
(),
1
);
if
(
shape
.
size
()
<
n
)
{
shape
.
insert
(
shape
.
end
(),
n
-
shape
.
size
(),
1
);
}
return
shape
;
};
Shape
shape1
(
fill_shape
(
4
,
framework
::
vectorize2int
(
mean_t
.
dims
())));
Shape
shape2
(
fill_shape
(
4
,
framework
::
vectorize2int
(
variance_t
.
dims
())));
auto
*
weight1
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
shape1
);
auto
*
mean_data
=
static_cast
<
float
*>
(
weight1
->
h_tensor
().
mutable_data
());
std
::
copy_n
(
mean_t
.
data
<
float
>
(),
mean_t
.
numel
(),
mean_data
);
engine_
->
AddOpAttr
(
bn_op_name
,
"weight_1"
,
*
weight1
);
auto
*
weight2
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
shape2
);
auto
*
variance_data
=
static_cast
<
float
*>
(
weight2
->
h_tensor
().
mutable_data
());
std
::
copy_n
(
variance_t
.
data
<
float
>
(),
variance_t
.
numel
(),
variance_data
);
engine_
->
AddOpAttr
(
bn_op_name
,
"weight_2"
,
*
weight2
);
Shape
shape3
(
std
::
vector
<
int
>
({
1
,
1
,
1
,
1
}));
auto
*
weight3
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
shape3
);
auto
*
alpha_data
=
static_cast
<
float
*>
(
weight3
->
h_tensor
().
mutable_data
());
float
weight3_data
[]
=
{
1
};
std
::
copy
(
std
::
begin
(
weight3_data
),
std
::
end
(
weight3_data
),
alpha_data
);
engine_
->
AddOpAttr
(
bn_op_name
,
"weight_3"
,
*
weight3
);
Shape
scale_shape
(
fill_shape
(
4
,
framework
::
vectorize2int
(
scale_t
.
dims
())));
auto
*
scale
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
scale_shape
);
auto
*
scale_data
=
static_cast
<
float
*>
(
scale
->
h_tensor
().
mutable_data
());
std
::
copy_n
(
scale_t
.
data
<
float
>
(),
scale_t
.
numel
(),
scale_data
);
Shape
bias_shape
(
fill_shape
(
4
,
framework
::
vectorize2int
(
bias_t
.
dims
())));
auto
*
bias
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
bias_shape
);
auto
*
bias_data
=
static_cast
<
float
*>
(
bias
->
h_tensor
().
mutable_data
());
std
::
copy_n
(
bias_t
.
data
<
float
>
(),
bias_t
.
numel
(),
bias_data
);
engine_
->
AddOp
(
scale_op_name
,
"Scale"
,
{
bn_output
},
{
output
});
engine_
->
AddOpAttr
(
scale_op_name
,
"axis"
,
1
);
engine_
->
AddOpAttr
(
scale_op_name
,
"num_axes"
,
1
);
engine_
->
AddOpAttr
(
scale_op_name
,
"bias_term"
,
true
);
engine_
->
AddOpAttr
(
scale_op_name
,
"weight_1"
,
*
scale
);
engine_
->
AddOpAttr
(
scale_op_name
,
"weight_2"
,
*
bias
);
this
->
engine_
->
AddOp
(
scale_op_name
,
"Scale"
,
{
bn_output
},
{
output
});
this
->
engine_
->
AddOpAttr
(
scale_op_name
,
"axis"
,
1
);
this
->
engine_
->
AddOpAttr
(
scale_op_name
,
"num_axes"
,
1
);
this
->
engine_
->
AddOpAttr
(
scale_op_name
,
"bias_term"
,
true
);
auto
*
mean_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Mean"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
mean_v
);
auto
weight1
=
pblock_from_var
<
TargetT
,
PrecisionT
>
(
*
mean_v
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
bn_op_name
,
"weight_1"
,
*
weight1
);
auto
*
variance_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Variance"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
variance_v
);
auto
weight2
=
pblock_from_var
<
TargetT
,
PrecisionT
>
(
*
variance_v
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
bn_op_name
,
"weight_2"
,
*
weight2
);
auto
*
weight3
=
pblock_from_vector
<
TargetT
,
PrecisionT
>
(
std
::
vector
<
float
>
({
1
}),
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
bn_op_name
,
"weight_3"
,
*
weight3
);
auto
*
scale_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Scale"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
scale_v
);
auto
scale
=
pblock_from_var
<
TargetT
,
PrecisionT
>
(
*
scale_v
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
scale_op_name
,
"weight_1"
,
*
scale
);
auto
*
bias_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Bias"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
bias_v
);
auto
bias
=
pblock_from_var
<
TargetT
,
PrecisionT
>
(
*
bias_v
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
scale_op_name
,
"weight_2"
,
*
bias
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
batch_norm
,
BatchNormOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
bn_nv_fp32
=
::
paddle
::
inference
::
anakin
::
BatchNormOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
bn_nv_int8
=
::
paddle
::
inference
::
anakin
::
BatchNormOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
batch_norm
,
bn_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
batch_norm
,
bn_nv_int8
);
#endif
using
bn_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
BatchNormOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
bn_cpu_int8
=
::
paddle
::
inference
::
anakin
::
BatchNormOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
batch_norm
,
bn_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
batch_norm
,
bn_cpu_int8
);
paddle/fluid/inference/anakin/convert/batch_norm.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
BatchNormOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
BatchNormOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
BatchNormOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/concat.cc
浏览文件 @
9f1927bd
...
...
@@ -15,38 +15,45 @@
#include "paddle/fluid/inference/anakin/convert/concat.h"
#include <algorithm>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
X86
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PBlock
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
ConcatOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
ConcatOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
int
axis
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"axis"
));
auto
input_names
=
op_desc
.
Input
(
"X"
);
// PADDLE_ENFORCE(axis > 0,
// "The axis attr of Concat op should be large than 0 for trt");
auto
y_name
=
op_desc
.
Output
(
"Out"
).
front
();
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
AddOp
(
op_name
,
"Concat"
,
input_names
,
{
y_name
});
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
axis
);
this
->
engine_
->
AddOp
(
op_name
,
"Concat"
,
input_names
,
{
y_name
});
this
->
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
axis
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
concat
,
ConcatOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
concat_nv_fp32
=
::
paddle
::
inference
::
anakin
::
ConcatOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
concat_nv_int8
=
::
paddle
::
inference
::
anakin
::
ConcatOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
concat
,
concat_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
concat
,
concat_nv_int8
);
#endif
using
concat_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
ConcatOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
concat_cpu_int8
=
::
paddle
::
inference
::
anakin
::
ConcatOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
concat
,
concat_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
concat
,
concat_cpu_int8
);
paddle/fluid/inference/anakin/convert/concat.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
ConcatOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
ConcatOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
ConcatOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/conv2d.cc
浏览文件 @
9f1927bd
...
...
@@ -16,21 +16,18 @@
#include <algorithm>
#include <memory>
#include <vector>
#include "paddle/fluid/inference/anakin/convert/helper.h"
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
Conv2dOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
Conv2dOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Input"
).
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Filter"
).
size
(),
1UL
);
...
...
@@ -39,50 +36,91 @@ void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
auto
input_name
=
op_desc
.
Input
(
"Input"
).
front
();
auto
output_name
=
op_desc
.
Output
(
"Output"
).
front
();
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Output"
).
front
();
engine_
->
AddOp
(
op_name
,
"Convolution"
,
{
input_name
},
{
output_name
});
this
->
engine_
->
AddOp
(
op_name
,
"Convolution"
,
{
input_name
},
{
output_name
});
auto
*
filter_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Filter"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
filter_v
);
auto
*
filter_t
=
filter_v
->
GetMutable
<
framework
::
LoDTensor
>
();
std
::
unique_ptr
<
framework
::
LoDTensor
>
weight_tensor
(
new
framework
::
LoDTensor
());
weight_tensor
->
Resize
(
filter_t
->
dims
());
TensorCopySync
((
*
filter_t
),
platform
::
CPUPlace
(),
weight_tensor
.
get
());
auto
weight_tensor
=
tensor_from_var
(
*
filter_v
,
platform
::
CPUPlace
());
auto
weight_shape
=
framework
::
vectorize2int
(
weight_tensor
->
dims
());
PADDLE_ENFORCE_EQ
(
weight_tensor
->
dims
().
size
(),
4UL
);
// const int n_output = weight_tensor->dims()[0];
// const int n_input = weight_tensor->dims()[1];
const
int
filter_h
=
weight_tensor
->
dims
()[
2
];
const
int
filter_w
=
weight_tensor
->
dims
()[
3
];
// auto filter_num = n_input * filter_h * filter_w ;
auto
filter_num
=
weight_tensor
->
dims
()[
0
];
engine_
->
AddOpAttr
<
int
>
(
op_name
,
"filter_num"
,
filter_num
);
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"kernel_size"
,
{
filter_h
,
filter_w
});
this
->
engine_
->
template
AddOpAttr
<
int
>(
op_name
,
"filter_num"
,
filter_num
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"kernel_size"
,
{
filter_h
,
filter_w
});
auto
strides
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"strides"
));
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"strides"
,
strides
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"strides"
,
strides
);
auto
paddings
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"paddings"
));
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"padding"
,
paddings
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"padding"
,
paddings
);
auto
dilations
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"dilations"
));
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"dilation_rate"
,
dilations
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"dilation_rate"
,
dilations
);
const
int
groups
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"groups"
));
engine_
->
AddOpAttr
(
op_name
,
"group"
,
groups
);
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
1
);
engine_
->
AddOpAttr
(
op_name
,
"bias_term"
,
false
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"group"
,
groups
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
1
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"bias_term"
,
false
);
::
anakin
::
saber
::
Shape
anakin_shape
(
weight_shape
);
bool
enable_int8
=
boost
::
get
<
bool
>
(
op_desc
.
HasAttr
(
"enable_int8"
));
auto
weight_shape
=
framework
::
vectorize2int
(
filter_t
->
dims
());
Shape
anakin_shape
(
weight_shape
);
auto
*
weight1
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
anakin_shape
);
float
*
cpu_data
=
static_cast
<
float
*>
(
weight1
->
h_tensor
().
mutable_data
());
std
::
copy_n
(
weight_tensor
->
data
<
float
>
(),
weight_tensor
->
numel
(),
cpu_data
);
weight1
->
d_tensor
().
set_shape
(
anakin_shape
);
weight1
->
d_tensor
().
copy_from
(
weight1
->
h_tensor
());
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
if
(
enable_int8
)
{
const
float
int8_range
=
127.
;
float
in_scale
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"input_scale"
));
float
weight_scale
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"weight_scale"
));
PBlock
<
TargetT
>
*
weight1
=
new
PBlock
<
TargetT
>
(
anakin_shape
,
::
anakin
::
AK_INT8
);
this
->
engine_
->
RegistBlock
(
weight1
);
float
*
weight_data
=
weight_tensor
->
data
<
float
>
();
std
::
vector
<
char
>
weight_int8
;
int
weight_num
=
weight_tensor
->
numel
();
for
(
int
i
=
0
;
i
<
weight_tensor
->
numel
();
i
++
)
{
bool
is_valid_int8
=
((
weight_data
[
i
]
>=
-
128
)
&&
(
weight_data
[
i
]
<=
127
));
PADDLE_ENFORCE
(
is_valid_int8
,
"We are in anakin subgraph int8 mode, the weight of conv "
"should be in range [-128, 127]"
);
weight_int8
.
push_back
(
static_cast
<
char
>
(
weight_data
[
i
]));
}
memcpy
(
static_cast
<
void
*>
(
weight1
->
h_tensor
().
mutable_data
()),
static_cast
<
void
*>
(
weight_int8
.
data
()),
sizeof
(
char
)
*
weight_num
);
weight1
->
d_tensor
().
set_shape
(
anakin_shape
);
weight1
->
d_tensor
().
copy_from
(
weight1
->
h_tensor
());
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
this
->
engine_
->
Graph
()
->
SetOpPrec
(
op_name
,
::
anakin
::
AK_INT8
);
this
->
engine_
->
Graph
()
->
SetWeightsScale
(
op_name
,
{
weight_scale
/
int8_range
},
false
);
this
->
engine_
->
AddTensorScale
(
input_name
,
in_scale
/
int8_range
);
}
else
{
auto
*
weight1
=
pblock_from_tensor
<
TargetT
,
PrecisionT
>
(
*
weight_tensor
,
weight_shape
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
}
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
conv2d
,
Conv2dOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
conv2d_nv_fp32
=
::
paddle
::
inference
::
anakin
::
Conv2dOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
conv2d_nv_int8
=
::
paddle
::
inference
::
anakin
::
Conv2dOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
conv2d
,
conv2d_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
conv2d
,
conv2d_nv_int8
);
#endif
using
conv2d_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
Conv2dOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
conv2d_cpu_int8
=
::
paddle
::
inference
::
anakin
::
Conv2dOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
conv2d
,
conv2d_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
conv2d
,
conv2d_cpu_int8
);
paddle/fluid/inference/anakin/convert/conv2d.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
Conv2dOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
Conv2dOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
Conv2dOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
浏览文件 @
9f1927bd
...
...
@@ -16,21 +16,18 @@
#include <algorithm>
#include <memory>
#include <vector>
#include "paddle/fluid/inference/anakin/convert/helper.h"
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
Conv2dFusionOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
Conv2dFusionOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Input"
).
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Filter"
).
size
(),
1UL
);
...
...
@@ -40,75 +37,96 @@ void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
auto
input_name
=
op_desc
.
Input
(
"Input"
).
front
();
auto
output_name
=
op_desc
.
Output
(
"Output"
).
front
();
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Output"
).
front
();
engine_
->
AddOp
(
op_name
,
"Convolution"
,
{
input_name
},
{
output_name
});
this
->
engine_
->
AddOp
(
op_name
,
"Convolution"
,
{
input_name
},
{
output_name
});
auto
*
filter_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Filter"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
filter_v
);
auto
*
filter_t
=
filter_v
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
weight_tensor
=
tensor_from_var
(
*
filter_v
,
platform
::
CPUPlace
());
auto
weight_shape
=
framework
::
vectorize2int
(
weight_tensor
->
dims
());
auto
*
b_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Bias"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
b_v
);
auto
*
b_t
=
b_v
->
GetMutable
<
framework
::
LoDTensor
>
();
std
::
unique_ptr
<
framework
::
LoDTensor
>
weight_tensor
(
new
framework
::
LoDTensor
());
weight_tensor
->
Resize
(
filter_t
->
dims
());
TensorCopySync
((
*
filter_t
),
platform
::
CPUPlace
(),
weight_tensor
.
get
());
PADDLE_ENFORCE_EQ
(
weight_tensor
->
dims
().
size
(),
4UL
);
// const int n_output = weight_tensor->dims()[0];
// const int n_input = weight_tensor->dims()[1];
const
int
filter_h
=
weight_tensor
->
dims
()[
2
];
const
int
filter_w
=
weight_tensor
->
dims
()[
3
];
// auto filter_num = n_input * filter_h * filter_w ;
auto
filter_num
=
weight_tensor
->
dims
()[
0
];
engine_
->
AddOpAttr
<
int
>
(
op_name
,
"filter_num"
,
filter_num
);
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"kernel_size"
,
{
filter_h
,
filter_w
});
this
->
engine_
->
template
AddOpAttr
<
int
>(
op_name
,
"filter_num"
,
filter_num
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"kernel_size"
,
{
filter_h
,
filter_w
});
auto
strides
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"strides"
));
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"strides"
,
strides
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"strides"
,
strides
);
auto
paddings
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"paddings"
));
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"padding"
,
paddings
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"padding"
,
paddings
);
auto
dilations
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"dilations"
));
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"dilation_rate"
,
dilations
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"dilation_rate"
,
dilations
);
const
int
groups
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"groups"
));
engine_
->
AddOpAttr
(
op_name
,
"group"
,
groups
);
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
1
);
engine_
->
AddOpAttr
(
op_name
,
"bias_term"
,
true
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"group"
,
groups
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
1
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"bias_term"
,
true
);
auto
weight_shape
=
framework
::
vectorize2int
(
filter_t
->
dims
());
Shape
anakin_shape
(
weight_shape
);
auto
*
weight1
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
anakin_shape
);
float
*
cpu_data
=
static_cast
<
float
*>
(
weight1
->
h_tensor
().
mutable_data
());
std
::
copy_n
(
weight_tensor
->
data
<
float
>
(),
weight_tensor
->
numel
(),
cpu_data
);
weight1
->
d_tensor
().
set_shape
(
anakin_shape
);
weight1
->
d_tensor
().
copy_from
(
weight1
->
h_tensor
());
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
auto
bias_shape
=
framework
::
vectorize2int
(
b_t
->
dims
());
framework
::
LoDTensor
bias_tensor
;
bias_tensor
.
Resize
(
b_t
->
dims
());
TensorCopySync
((
*
b_t
),
platform
::
CPUPlace
(),
&
bias_tensor
);
auto
*
bias_data
=
bias_tensor
.
data
<
float
>
();
bias_shape
.
insert
(
bias_shape
.
begin
(),
1
);
bias_shape
.
insert
(
bias_shape
.
begin
(),
1
);
bias_shape
.
insert
(
bias_shape
.
begin
(),
1
);
// bias_shape.push_back(1);
// bias_shape.push_back(1);
Shape
anakin_bias_shape
(
bias_shape
);
auto
*
weight2
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
anakin_bias_shape
);
float
*
cpu_data2
=
static_cast
<
float
*>
(
weight2
->
h_tensor
().
mutable_data
());
std
::
copy_n
(
bias_data
,
bias_tensor
.
numel
(),
cpu_data2
);
weight2
->
d_tensor
().
set_shape
(
anakin_bias_shape
);
weight2
->
d_tensor
().
copy_from
(
weight2
->
h_tensor
());
engine_
->
AddOpAttr
(
op_name
,
"weight_2"
,
*
weight2
);
::
anakin
::
saber
::
Shape
anakin_shape
(
weight_shape
);
bool
enable_int8
=
boost
::
get
<
bool
>
(
op_desc
.
HasAttr
(
"enable_int8"
));
if
(
enable_int8
)
{
const
float
int8_range
=
127.
;
float
in_scale
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"input_scale"
));
float
weight_scale
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"weight_scale"
));
PBlock
<
TargetT
>
*
weight1
=
new
PBlock
<
TargetT
>
(
anakin_shape
,
::
anakin
::
AK_INT8
);
this
->
engine_
->
RegistBlock
(
weight1
);
float
*
weight_data
=
weight_tensor
->
data
<
float
>
();
std
::
vector
<
char
>
weight_int8
;
int
weight_num
=
weight_tensor
->
numel
();
for
(
int
i
=
0
;
i
<
weight_tensor
->
numel
();
i
++
)
{
bool
is_valid_int8
=
((
weight_data
[
i
]
>=
-
128
)
&&
(
weight_data
[
i
]
<=
127
));
PADDLE_ENFORCE
(
is_valid_int8
,
"We are in anakin subgraph int8 mode, the weight of conv "
"should be in range [-128, 127]"
);
weight_int8
.
push_back
(
static_cast
<
char
>
(
weight_data
[
i
]));
}
memcpy
(
static_cast
<
void
*>
(
weight1
->
h_tensor
().
mutable_data
()),
static_cast
<
void
*>
(
weight_int8
.
data
()),
sizeof
(
char
)
*
weight_num
);
weight1
->
d_tensor
().
set_shape
(
anakin_shape
);
weight1
->
d_tensor
().
copy_from
(
weight1
->
h_tensor
());
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
this
->
engine_
->
Graph
()
->
SetOpPrec
(
op_name
,
::
anakin
::
AK_INT8
);
this
->
engine_
->
Graph
()
->
SetWeightsScale
(
op_name
,
{
weight_scale
/
int8_range
},
false
);
this
->
engine_
->
AddTensorScale
(
input_name
,
in_scale
/
int8_range
);
}
else
{
auto
weight_tensor
=
tensor_from_var
(
*
filter_v
,
platform
::
CPUPlace
());
auto
weight_shape
=
framework
::
vectorize2int
(
weight_tensor
->
dims
());
auto
*
weight1
=
pblock_from_tensor
<
TargetT
,
PrecisionT
>
(
*
weight_tensor
,
weight_shape
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
auto
weight2
=
pblock_from_var
<
TargetT
,
PrecisionT
>
(
*
b_v
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_2"
,
*
weight2
);
}
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
conv2d_fusion
,
Conv2dFusionOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
conv2d_fusion_nv_fp32
=
::
paddle
::
inference
::
anakin
::
Conv2dFusionOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
conv2d_fusion_nv_int8
=
::
paddle
::
inference
::
anakin
::
Conv2dFusionOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
conv2d_fusion
,
conv2d_fusion_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
conv2d_fusion
,
conv2d_fusion_nv_int8
);
#endif
using
conv2d_fusion_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
Conv2dFusionOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
conv2d_fusion_cpu_int8
=
::
paddle
::
inference
::
anakin
::
Conv2dFusionOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
conv2d_fusion
,
conv2d_fusion_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
conv2d_fusion
,
conv2d_fusion_cpu_int8
);
paddle/fluid/inference/anakin/convert/conv2d_fusion.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
Conv2dFusionOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
Conv2dFusionOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
Conv2dFusionOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/density_prior_box.cc
浏览文件 @
9f1927bd
...
...
@@ -17,17 +17,14 @@
#include <map>
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
DensityPriorBoxOpConverter
::
operator
()(
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
DensityPriorBoxOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
...
...
@@ -81,27 +78,55 @@ void DensityPriorBoxOpConverter::operator()(
std
::
vector
<
float
>
temp_v
=
{};
engine_
->
AddOp
(
op_name
,
"PriorBox"
,
{
input_name
,
image_name
},
{
output_name
});
engine_
->
AddOpAttr
<
PTuple
<
float
>>
(
op_name
,
"min_size"
,
min_sizes
);
engine_
->
AddOpAttr
<
PTuple
<
float
>>
(
op_name
,
"max_size"
,
max_sizes
);
engine_
->
AddOpAttr
<
PTuple
<
float
>>
(
op_name
,
"aspect_ratio"
,
aspect_ratios
);
engine_
->
AddOpAttr
<
PTuple
<
float
>>
(
op_name
,
"fixed_size"
,
fixed_sizes
);
engine_
->
AddOpAttr
<
PTuple
<
float
>>
(
op_name
,
"fixed_ratio"
,
fixed_ratios
);
engine_
->
AddOpAttr
<
PTuple
<
float
>>
(
op_name
,
"density"
,
dens
);
engine_
->
AddOpAttr
(
op_name
,
"is_flip"
,
is_flip
);
engine_
->
AddOpAttr
(
op_name
,
"is_clip"
,
is_clip
);
engine_
->
AddOpAttr
<
PTuple
<
float
>>
(
op_name
,
"variance"
,
variances
);
engine_
->
AddOpAttr
(
op_name
,
"img_h"
,
static_cast
<
int
>
(
0
));
engine_
->
AddOpAttr
(
op_name
,
"img_w"
,
static_cast
<
int
>
(
0
));
engine_
->
AddOpAttr
(
op_name
,
"step_h"
,
step_h
);
engine_
->
AddOpAttr
(
op_name
,
"step_w"
,
step_w
);
engine_
->
AddOpAttr
(
op_name
,
"offset"
,
offset
);
engine_
->
AddOpAttr
<
PTuple
<
std
::
string
>>
(
op_name
,
"order"
,
t_order
);
this
->
engine_
->
AddOp
(
op_name
,
"PriorBox"
,
{
input_name
,
image_name
},
{
output_name
});
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
float
>
>
(
op_name
,
"min_size"
,
min_sizes
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
float
>
>
(
op_name
,
"max_size"
,
max_sizes
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
float
>
>
(
op_name
,
"aspect_ratio"
,
aspect_ratios
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
float
>
>
(
op_name
,
"fixed_size"
,
fixed_sizes
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
float
>
>
(
op_name
,
"fixed_ratio"
,
fixed_ratios
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
float
>
>
(
op_name
,
"density"
,
dens
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"is_flip"
,
is_flip
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"is_clip"
,
is_clip
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
float
>
>
(
op_name
,
"variance"
,
variances
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"img_h"
,
static_cast
<
int
>
(
0
));
this
->
engine_
->
AddOpAttr
(
op_name
,
"img_w"
,
static_cast
<
int
>
(
0
));
this
->
engine_
->
AddOpAttr
(
op_name
,
"step_h"
,
step_h
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"step_w"
,
step_w
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"offset"
,
offset
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
std
::
string
>
>
(
op_name
,
"order"
,
t_order
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
density_prior_box
,
DensityPriorBoxOpConverter
);
REGISTER_ANAKIN_OP_CONVERTER
(
prior_box
,
DensityPriorBoxOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
ds_pr_nv_fp32
=
::
paddle
::
inference
::
anakin
::
DensityPriorBoxOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
ds_pr_nv_int8
=
::
paddle
::
inference
::
anakin
::
DensityPriorBoxOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
density_prior_box
,
ds_pr_nv_fp32
);
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
prior_box
,
ds_pr_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
density_prior_box
,
ds_pr_nv_int8
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
prior_box
,
ds_pr_nv_int8
);
#endif
using
ds_pr_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
DensityPriorBoxOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
ds_pr_cpu_int8
=
::
paddle
::
inference
::
anakin
::
DensityPriorBoxOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
density_prior_box
,
ds_pr_cpu_fp32
);
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
prior_box
,
ds_pr_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
density_prior_box
,
ds_pr_cpu_int8
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
prior_box
,
ds_pr_cpu_int8
);
paddle/fluid/inference/anakin/convert/density_prior_box.h
浏览文件 @
9f1927bd
...
...
@@ -22,7 +22,9 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
DensityPriorBoxOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
DensityPriorBoxOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
DensityPriorBoxOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/detection_out.cc
浏览文件 @
9f1927bd
...
...
@@ -16,19 +16,14 @@
#include <algorithm>
#include <map>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
DetectionOutOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
DetectionOutOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
auto
target_name
=
op_desc
.
Input
(
"TargetBox"
).
front
();
auto
prior_box_name
=
op_desc
.
Input
(
"PriorBox"
).
front
();
...
...
@@ -52,22 +47,41 @@ void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
"Not support encode_center_size code_type in DetectionOut of anakin"
);
}
engine_
->
AddOp
(
op_name
,
"DetectionOutput"
,
{
target_name
,
scores_name
,
prior_box_name
},
{
output_name
});
engine_
->
AddOpAttr
(
op_name
,
"share_location"
,
true
);
engine_
->
AddOpAttr
(
op_name
,
"variance_encode_in_target"
,
false
);
engine_
->
AddOpAttr
(
op_name
,
"class_num"
,
static_cast
<
int
>
(
0
));
engine_
->
AddOpAttr
(
op_name
,
"background_id"
,
background_label
);
engine_
->
AddOpAttr
(
op_name
,
"keep_top_k"
,
keep_top_k
);
engine_
->
AddOpAttr
(
op_name
,
"code_type"
,
anakin_code_type
);
engine_
->
AddOpAttr
(
op_name
,
"conf_thresh"
,
score_threshold
);
engine_
->
AddOpAttr
(
op_name
,
"nms_top_k"
,
nms_top_k
);
engine_
->
AddOpAttr
(
op_name
,
"nms_thresh"
,
nms_threshold
);
engine_
->
AddOpAttr
(
op_name
,
"nms_eta"
,
nms_eta
);
this
->
engine_
->
AddOp
(
op_name
,
"DetectionOutput"
,
{
target_name
,
scores_name
,
prior_box_name
},
{
output_name
});
this
->
engine_
->
AddOpAttr
(
op_name
,
"share_location"
,
true
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"variance_encode_in_target"
,
false
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"class_num"
,
static_cast
<
int
>
(
0
));
this
->
engine_
->
AddOpAttr
(
op_name
,
"background_id"
,
background_label
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"keep_top_k"
,
keep_top_k
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"code_type"
,
anakin_code_type
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"conf_thresh"
,
score_threshold
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"nms_top_k"
,
nms_top_k
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"nms_thresh"
,
nms_threshold
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"nms_eta"
,
nms_eta
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
detection_out
,
DetectionOutOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
detection_out_nv_fp32
=
::
paddle
::
inference
::
anakin
::
DetectionOutOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
detection_out_nv_int8
=
::
paddle
::
inference
::
anakin
::
DetectionOutOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
detection_out
,
detection_out_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
detection_out
,
detection_out_nv_int8
);
#endif
using
detection_out_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
DetectionOutOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
detection_out_cpu_int8
=
::
paddle
::
inference
::
anakin
::
DetectionOutOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
detection_out
,
detection_out_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
detection_out
,
detection_out_cpu_int8
);
paddle/fluid/inference/anakin/convert/detection_out.h
浏览文件 @
9f1927bd
...
...
@@ -22,7 +22,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
DetectionOutOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
DetectionOutOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
DetectionOutOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/dropout.cc
浏览文件 @
9f1927bd
...
...
@@ -16,24 +16,16 @@
#include <algorithm>
#include <string>
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
X86
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PBlock
;
using
anakin
::
PTuple
;
#include "paddle/fluid/inference/anakin/convert/helper.h"
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
DropoutOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
DropoutOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Mask"
).
size
(),
1
);
...
...
@@ -43,25 +35,39 @@ void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
auto
out_name
=
op_desc
.
Output
(
"Out"
).
front
();
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
AddOp
(
op_name
,
"Scale"
,
{
x_name
},
{
out_name
});
this
->
engine_
->
AddOp
(
op_name
,
"Scale"
,
{
x_name
},
{
out_name
});
auto
dropout_prob
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"dropout_prob"
));
auto
factor
=
1
-
dropout_prob
;
Shape
shape1
(
std
::
vector
<
int
>
({
1
,
1
,
1
,
1
}));
auto
*
weight1
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
shape1
);
auto
*
factor_data
=
static_cast
<
float
*>
(
weight1
->
h_tensor
().
mutable_data
());
float
weight1_data
[]
=
{
factor
};
std
::
copy
(
std
::
begin
(
weight1_data
),
std
::
end
(
weight1_data
),
factor_data
);
auto
*
weight1
=
pblock_from_vector
<
TargetT
,
PrecisionT
>
(
std
::
vector
<
float
>
({
factor
}),
this
->
engine_
);
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
0
);
engine_
->
AddOpAttr
(
op_name
,
"num_axes"
,
0
);
engine_
->
AddOpAttr
(
op_name
,
"bias_term"
,
false
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
0
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"num_axes"
,
0
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"bias_term"
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
dropout
,
DropoutOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
dropout_nv_fp32
=
::
paddle
::
inference
::
anakin
::
DropoutOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
dropout_nv_int8
=
::
paddle
::
inference
::
anakin
::
DropoutOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
dropout
,
dropout_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
dropout
,
dropout_nv_int8
);
#endif
using
dropout_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
DropoutOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
dropout_cpu_int8
=
::
paddle
::
inference
::
anakin
::
DropoutOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
dropout
,
dropout_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
dropout
,
dropout_cpu_int8
);
paddle/fluid/inference/anakin/convert/dropout.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
DropoutOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
DropoutOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
DropoutOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/elementwise.cc
浏览文件 @
9f1927bd
...
...
@@ -17,20 +17,14 @@
#include <string>
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
X86
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PBlock
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
ElementwiseAddOpConverter
::
operator
()(
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
ElementwiseAddOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
...
...
@@ -43,14 +37,16 @@ void ElementwiseAddOpConverter::operator()(
auto
out_name
=
op_desc
.
Output
(
"Out"
).
front
();
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
AddOp
(
op_name
,
"Eltwise"
,
{
x_name
,
y_name
},
{
out_name
});
this
->
engine_
->
AddOp
(
op_name
,
"Eltwise"
,
{
x_name
,
y_name
},
{
out_name
});
std
::
string
elementwise_type
=
"Add"
;
engine_
->
AddOpAttr
<
std
::
string
>
(
op_name
,
"type"
,
elementwise_type
);
this
->
engine_
->
template
AddOpAttr
<
std
::
string
>(
op_name
,
"type"
,
elementwise_type
);
std
::
vector
<
float
>
coeff
=
{
1.0
,
1.0
};
engine_
->
AddOpAttr
<
PTuple
<
float
>>
(
op_name
,
"coeff"
,
coeff
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
float
>
>
(
op_name
,
"coeff"
,
coeff
);
}
void
ElementwiseMulOpConverter
::
operator
()(
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
ElementwiseMulOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
...
...
@@ -63,26 +59,44 @@ void ElementwiseMulOpConverter::operator()(
auto
out_name
=
op_desc
.
Output
(
"Out"
).
front
();
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
AddOp
(
op_name
,
"Scale"
,
{
x_name
,
y_name
},
{
out_name
});
// Fill a number to weight_1 as a placeholder.
Shape
shape1
(
std
::
vector
<
int
>
({
1
,
1
,
1
,
1
}));
auto
*
weight1
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
shape1
);
auto
*
placeholder_data
=
static_cast
<
float
*>
(
weight1
->
h_tensor
().
mutable_data
());
float
weight1_data
[]
=
{
1
};
std
::
copy
(
std
::
begin
(
weight1_data
),
std
::
end
(
weight1_data
),
placeholder_data
);
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
auto
axis
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"axis"
));
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
axis
);
engine_
->
AddOpAttr
(
op_name
,
"num_axes"
,
1
);
engine_
->
AddOpAttr
(
op_name
,
"bias_term"
,
false
);
this
->
engine_
->
AddOp
(
op_name
,
"Eltwise"
,
{
x_name
,
y_name
},
{
out_name
});
std
::
string
elementwise_type
=
"Prod"
;
this
->
engine_
->
template
AddOpAttr
<
std
::
string
>(
op_name
,
"type"
,
elementwise_type
);
std
::
vector
<
float
>
coeff
=
{
1.0
,
1.0
};
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
float
>
>
(
op_name
,
"coeff"
,
coeff
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
elementwise_add
,
ElementwiseAddOpConverter
);
REGISTER_ANAKIN_OP_CONVERTER
(
elementwise_mul
,
ElementwiseMulOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
elet_nv_fp32
=
::
paddle
::
inference
::
anakin
::
ElementwiseAddOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
elet_nv_int8
=
::
paddle
::
inference
::
anakin
::
ElementwiseAddOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
using
eletmul_nv_fp32
=
::
paddle
::
inference
::
anakin
::
ElementwiseMulOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
eletmul_nv_int8
=
::
paddle
::
inference
::
anakin
::
ElementwiseMulOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
elementwise_add
,
elet_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
elementwise_add
,
elet_nv_int8
);
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
elementwise_mul
,
eletmul_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
elementwise_mul
,
eletmul_nv_int8
);
#endif
using
elet_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
ElementwiseAddOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
elet_cpu_int8
=
::
paddle
::
inference
::
anakin
::
ElementwiseAddOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
using
eletmul_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
ElementwiseMulOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
eletmul_cpu_int8
=
::
paddle
::
inference
::
anakin
::
ElementwiseMulOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
elementwise_add
,
elet_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
elementwise_add
,
elet_cpu_int8
);
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
elementwise_mul
,
eletmul_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
elementwise_mul
,
eletmul_cpu_int8
);
paddle/fluid/inference/anakin/convert/elementwise.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,9 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
ElementwiseAddOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
ElementwiseAddOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
ElementwiseAddOpConverter
()
=
default
;
...
...
@@ -33,7 +35,9 @@ class ElementwiseAddOpConverter : public AnakinOpConverter {
private:
};
class
ElementwiseMulOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
ElementwiseMulOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
ElementwiseMulOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/fc.cc
浏览文件 @
9f1927bd
...
...
@@ -16,23 +16,19 @@
#include <algorithm>
#include <string>
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
#include "paddle/fluid/inference/anakin/convert/helper.h"
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
FcBaseOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
FcBaseOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
auto
input_names
=
op_desc
.
InputNames
();
bool
with_bias
=
input_names
.
size
()
=
=
3
;
bool
with_bias
=
input_names
.
size
()
>
=
3
;
std
::
string
w_name
=
"Y"
;
std
::
string
i_name
=
"X"
;
...
...
@@ -46,71 +42,74 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
// get weights
auto
*
y_v
=
scope
.
FindVar
(
op_desc
.
Input
(
w_name
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
y_v
);
auto
*
y_t
=
y_v
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
input_name
=
op_desc
.
Input
(
i_name
).
front
();
auto
output_name
=
op_desc
.
Output
(
"Out"
).
front
();
auto
weight_tensor
=
tensor_from_var
(
*
y_v
,
platform
::
CPUPlace
());
auto
weight_shape
=
framework
::
vectorize2int
(
weight_tensor
->
dims
());
engine_
->
AddOp
(
op_name
,
"Dense"
,
{
input_name
},
{
output_name
});
engine_
->
AddOpAttr
(
op_name
,
"bias_term"
,
with_bias
);
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
1
);
auto
weight_shape
=
framework
::
vectorize2int
(
y_t
->
dims
());
int
out_dim
=
weight_shape
[
1
];
engine_
->
AddOpAttr
(
op_name
,
"out_dim"
,
out_dim
);
const
int
w_m
=
weight_shape
[
0
];
const
int
w_k
=
weight_shape
[
1
];
if
(
weight_shape
.
size
()
<
4UL
)
{
weight_shape
.
insert
(
weight_shape
.
begin
(),
4UL
-
weight_shape
.
size
(),
1
);
}
Shape
anakin_shape
(
weight_shape
);
auto
input_name
=
op_desc
.
Input
(
i_name
).
front
();
auto
output_name
=
op_desc
.
Output
(
"Out"
).
front
();
this
->
engine_
->
AddOp
(
op_name
,
"Dense"
,
{
input_name
},
{
output_name
});
this
->
engine_
->
AddOpAttr
(
op_name
,
"bias_term"
,
with_bias
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
1
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"out_dim"
,
out_dim
);
framework
::
LoDTensor
weight_tensor
;
weight_tensor
.
Resize
(
y_t
->
dims
());
TensorCopySync
((
*
y_t
),
platform
::
CPUPlace
(),
&
weight_tensor
);
auto
*
weight_data
=
weight_tensor
.
data
<
float
>
();
PADDLE_ENFORCE
(
w_m
*
w_k
==
weight_tensor
.
numel
());
auto
*
weight_data
=
weight_tensor
->
data
<
float
>
();
PADDLE_ENFORCE
(
w_m
*
w_k
==
weight_tensor
->
numel
());
std
::
vector
<
float
>
trans_weight_data
(
weight_tensor
.
numel
());
std
::
vector
<
float
>
trans_weight_data
(
weight_tensor
->
numel
());
for
(
int
i
=
0
;
i
<
w_m
;
i
++
)
{
for
(
int
j
=
0
;
j
<
w_k
;
j
++
)
{
trans_weight_data
[
i
+
j
*
w_m
]
=
weight_data
[
i
*
w_k
+
j
];
}
}
auto
*
weight1
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
anakin_shape
);
float
*
cpu_data
=
static_cast
<
float
*>
(
weight1
->
h_tensor
().
mutable_data
());
std
::
copy_n
(
trans_weight_data
.
data
(),
weight_tensor
.
numel
(),
cpu_data
);
weight1
->
d_tensor
().
set_shape
(
anakin_shape
);
weight1
->
d_tensor
().
copy_from
(
weight1
->
h_tensor
());
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
int
weight_num
=
weight_tensor
->
numel
();
bool
enable_int8
=
boost
::
get
<
bool
>
(
op_desc
.
HasAttr
(
"enable_int8"
));
if
(
enable_int8
)
{
if
(
weight_shape
.
size
()
<
4UL
)
{
weight_shape
.
insert
(
weight_shape
.
begin
(),
4UL
-
weight_shape
.
size
(),
1
);
}
::
anakin
::
saber
::
Shape
anakin_shape
(
weight_shape
);
const
float
int8_range
=
127.
;
float
in_scale
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"input_scale"
));
float
weight_scale
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"weight_scale"
));
PBlock
<
TargetT
>
*
weight1
=
new
PBlock
<
TargetT
>
(
anakin_shape
,
::
anakin
::
AK_INT8
);
this
->
engine_
->
RegistBlock
(
weight1
);
std
::
vector
<
char
>
weight_int8
;
for
(
int
i
=
0
;
i
<
weight_num
;
i
++
)
{
bool
is_valid_int8
=
((
trans_weight_data
[
i
]
>=
-
128
)
&&
(
trans_weight_data
[
i
]
<=
127
));
PADDLE_ENFORCE
(
is_valid_int8
,
"We are in anakin subgraph int8 mode, the weight of fc "
"should be in range [-128, 127]"
);
weight_int8
.
push_back
(
static_cast
<
char
>
(
trans_weight_data
[
i
]));
}
memcpy
(
static_cast
<
void
*>
(
weight1
->
h_tensor
().
mutable_data
()),
static_cast
<
void
*>
(
weight_int8
.
data
()),
sizeof
(
char
)
*
weight_num
);
weight1
->
d_tensor
().
set_shape
(
anakin_shape
);
weight1
->
d_tensor
().
copy_from
(
weight1
->
h_tensor
());
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
this
->
engine_
->
Graph
()
->
SetOpPrec
(
op_name
,
::
anakin
::
AK_INT8
);
this
->
engine_
->
Graph
()
->
SetWeightsScale
(
op_name
,
{
weight_scale
/
int8_range
},
false
);
this
->
engine_
->
AddTensorScale
(
input_name
,
in_scale
/
int8_range
);
}
else
{
auto
*
weight1
=
pblock_from_vector
<
TargetT
,
PrecisionT
>
(
trans_weight_data
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_1"
,
*
weight1
);
}
// get bias
if
(
with_bias
)
{
auto
*
b_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Bias"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
b_v
);
auto
*
b_t
=
b_v
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
bias_shape
=
framework
::
vectorize2int
(
b_t
->
dims
());
framework
::
LoDTensor
bias_tensor
;
bias_tensor
.
Resize
(
b_t
->
dims
());
TensorCopySync
((
*
b_t
),
platform
::
CPUPlace
(),
&
bias_tensor
);
auto
*
bias_data
=
bias_tensor
.
data
<
float
>
();
bias_shape
.
insert
(
bias_shape
.
begin
(),
1
);
bias_shape
.
insert
(
bias_shape
.
begin
(),
1
);
bias_shape
.
insert
(
bias_shape
.
begin
(),
1
);
// bias_shape.push_back(1);
// bias_shape.push_back(1);
Shape
anakin_bias_shape
(
bias_shape
);
auto
*
weight2
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
anakin_bias_shape
);
float
*
cpu_data2
=
static_cast
<
float
*>
(
weight2
->
h_tensor
().
mutable_data
());
std
::
copy_n
(
bias_data
,
bias_tensor
.
numel
(),
cpu_data2
);
weight2
->
d_tensor
().
set_shape
(
anakin_bias_shape
);
weight2
->
d_tensor
().
copy_from
(
weight2
->
h_tensor
());
engine_
->
AddOpAttr
(
op_name
,
"weight_2"
,
*
weight2
);
auto
weight2
=
pblock_from_var
<
TargetT
,
PrecisionT
>
(
*
b_v
,
this
->
engine_
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"weight_2"
,
*
weight2
);
}
}
...
...
@@ -118,5 +117,40 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
mul
,
MulOpConverter
);
REGISTER_ANAKIN_OP_CONVERTER
(
fc
,
FcOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
mul_nv_fp32
=
::
paddle
::
inference
::
anakin
::
MulOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
fc_nv_fp32
=
::
paddle
::
inference
::
anakin
::
FcOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
mul_nv_int8
=
::
paddle
::
inference
::
anakin
::
MulOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
using
fc_nv_int8
=
::
paddle
::
inference
::
anakin
::
FcOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
mul
,
mul_nv_fp32
);
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
fc
,
fc_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
mul
,
mul_nv_int8
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
fc
,
fc_nv_int8
);
#endif
using
mul_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
MulOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
fc_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
FcOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
mul_cpu_int8
=
::
paddle
::
inference
::
anakin
::
MulOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
using
fc_cpu_int8
=
::
paddle
::
inference
::
anakin
::
FcOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
mul
,
mul_cpu_fp32
);
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
fc
,
fc_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
mul
,
mul_cpu_int8
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
fc
,
fc_cpu_int8
);
paddle/fluid/inference/anakin/convert/fc.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
FcBaseOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
FcBaseOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
FcBaseOpConverter
()
=
default
;
...
...
@@ -32,13 +33,15 @@ class FcBaseOpConverter : public AnakinOpConverter {
};
// with bias
class
FcOpConverter
:
public
FcBaseOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
FcOpConverter
:
public
FcBaseOpConverter
<
TargetT
,
PrecisionT
>
{
public:
FcOpConverter
()
=
default
;
};
// without bias
class
MulOpConverter
:
public
FcBaseOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
MulOpConverter
:
public
FcBaseOpConverter
<
TargetT
,
PrecisionT
>
{
public:
MulOpConverter
()
=
default
;
};
...
...
paddle/fluid/inference/anakin/convert/flatten.cc
浏览文件 @
9f1927bd
...
...
@@ -15,20 +15,16 @@
#include "paddle/fluid/inference/anakin/convert/flatten.h"
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
FlattenOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
FlattenOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1UL
);
...
...
@@ -41,12 +37,30 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
std
::
vector
<
int
>
out_dims
=
{
0
,
-
1
,
1
,
1
};
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
AddOp
(
op_name
,
"Reshape"
,
{
input
},
{
output
});
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"dims"
,
out_dims
);
this
->
engine_
->
AddOp
(
op_name
,
"Reshape"
,
{
input
},
{
output
});
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"dims"
,
out_dims
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
flatten
,
FlattenOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
flatten_nv_fp32
=
::
paddle
::
inference
::
anakin
::
FlattenOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
flatten_nv_int8
=
::
paddle
::
inference
::
anakin
::
FlattenOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
flatten
,
flatten_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
flatten
,
flatten_nv_int8
);
#endif
using
flatten_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
FlattenOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
flatten_cpu_int8
=
::
paddle
::
inference
::
anakin
::
FlattenOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
flatten
,
flatten_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
flatten
,
flatten_cpu_int8
);
paddle/fluid/inference/anakin/convert/flatten.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
FlattenOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
FlattenOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
FlattenOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/helper.cc
0 → 100644
浏览文件 @
9f1927bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/helper.h"
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
std
::
unique_ptr
<
framework
::
LoDTensor
>
tensor_from_var
(
const
framework
::
Variable
&
var
,
const
platform
::
Place
&
place
)
{
auto
&
src
=
var
.
Get
<
framework
::
LoDTensor
>
();
std
::
unique_ptr
<
framework
::
LoDTensor
>
dst
(
new
framework
::
LoDTensor
());
dst
->
Resize
(
src
.
dims
());
TensorCopySync
((
src
),
place
,
dst
.
get
());
return
dst
;
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/anakin/convert/helper.h
0 → 100644
浏览文件 @
9f1927bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/inference/anakin/engine.h"
#include "framework/core/net/net.h"
#include "framework/core/types.h"
#include "framework/graph/graph.h"
#include "framework/graph/graph_global_mem.h"
#include "saber/saber_types.h"
using
anakin
::
saber
::
Shape
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
AK_INT8
;
using
anakin
::
PBlock
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
std
::
unique_ptr
<
framework
::
LoDTensor
>
tensor_from_var
(
const
framework
::
Variable
&
var
,
const
platform
::
Place
&
place
);
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
PBlock
<
TargetT
>*
pblock_from_tensor
(
const
framework
::
LoDTensor
&
tensor
,
std
::
vector
<
int
>
shape_vec
,
AnakinEngine
<
TargetT
,
PrecisionT
>*
engine
)
{
while
(
shape_vec
.
size
()
<
4
)
{
shape_vec
.
insert
(
shape_vec
.
begin
(),
1
);
}
Shape
shape
(
shape_vec
);
PBlock
<
TargetT
>*
weight
=
new
PBlock
<
TargetT
>
(
shape
,
AK_FLOAT
);
engine
->
RegistBlock
(
weight
);
float
*
cpu_data
=
static_cast
<
float
*>
(
weight
->
h_tensor
().
mutable_data
());
std
::
copy_n
(
tensor
.
data
<
float
>
(),
tensor
.
numel
(),
cpu_data
);
weight
->
d_tensor
().
set_shape
(
shape
);
weight
->
d_tensor
().
copy_from
(
weight
->
h_tensor
());
return
weight
;
}
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
PBlock
<
TargetT
>*
pblock_from_vector
(
const
std
::
vector
<
float
>&
vec
,
std
::
vector
<
int
>
shape_vec
,
AnakinEngine
<
TargetT
,
PrecisionT
>*
engine
)
{
while
(
shape_vec
.
size
()
<
4
)
{
shape_vec
.
insert
(
shape_vec
.
begin
(),
1
);
}
Shape
shape
(
shape_vec
);
PBlock
<
TargetT
>*
weight
=
new
PBlock
<
TargetT
>
(
shape
,
AK_FLOAT
);
engine
->
RegistBlock
(
weight
);
auto
*
weight_data
=
static_cast
<
float
*>
(
weight
->
h_tensor
().
mutable_data
());
std
::
copy
(
std
::
begin
(
vec
),
std
::
end
(
vec
),
weight_data
);
weight
->
d_tensor
().
set_shape
(
shape
);
weight
->
d_tensor
().
copy_from
(
weight
->
h_tensor
());
return
weight
;
}
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
PBlock
<
TargetT
>*
pblock_from_vector
(
const
std
::
vector
<
float
>&
vec
,
AnakinEngine
<
TargetT
,
PrecisionT
>*
engine
)
{
int
size
=
vec
.
size
();
return
pblock_from_vector
<
TargetT
,
PrecisionT
>
(
vec
,
std
::
vector
<
int
>
({
1
,
1
,
1
,
size
}),
engine
);
}
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
PBlock
<
TargetT
>*
pblock_from_var
(
const
framework
::
Variable
&
var
,
AnakinEngine
<
TargetT
,
PrecisionT
>*
engine
)
{
auto
tensor
=
tensor_from_var
(
var
,
platform
::
CPUPlace
());
auto
shape
=
framework
::
vectorize2int
(
tensor
->
dims
());
return
pblock_from_tensor
<
TargetT
,
PrecisionT
>
(
*
tensor
,
shape
,
engine
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/anakin/convert/im2sequence.cc
浏览文件 @
9f1927bd
...
...
@@ -17,23 +17,16 @@
#include <string>
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
X86
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PBlock
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
Im2SequenceConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
Im2SequenceConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Y"
).
size
(),
0
);
...
...
@@ -43,21 +36,37 @@ void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
auto
out_name
=
op_desc
.
Output
(
"Out"
).
front
();
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
AddOp
(
op_name
,
"Im2Sequence"
,
{
x_name
},
{
out_name
});
this
->
engine_
->
AddOp
(
op_name
,
"Im2Sequence"
,
{
x_name
},
{
out_name
});
std
::
vector
<
int
>
dilations
=
{
1
,
1
};
auto
paddings
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"paddings"
));
auto
strides
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"strides"
));
auto
kernels
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"kernels"
));
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"paddings"
,
paddings
);
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"strides"
,
strides
);
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"window_size"
,
kernels
);
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"dilations"
,
dilations
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"paddings"
,
paddings
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"strides"
,
strides
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"window_size"
,
kernels
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"dilations"
,
dilations
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
im2sequence
,
Im2SequenceConverter
);
#ifdef PADDLE_WITH_CUDA
using
im2sequence_nv_fp32
=
::
paddle
::
inference
::
anakin
::
Im2SequenceConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
im2sequence_nv_int8
=
::
paddle
::
inference
::
anakin
::
Im2SequenceConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
im2sequence
,
im2sequence_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
im2sequence
,
im2sequence_nv_int8
);
#endif
using
im2sequence_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
Im2SequenceConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
im2sequence_cpu_int8
=
::
paddle
::
inference
::
anakin
::
Im2SequenceConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
im2sequence
,
im2sequence_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
im2sequence
,
im2sequence_cpu_int8
);
paddle/fluid/inference/anakin/convert/im2sequence.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
Im2SequenceConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
Im2SequenceConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
Im2SequenceConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/op_converter.h
浏览文件 @
9f1927bd
...
...
@@ -32,10 +32,10 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
using
AnakinNvEngine
=
AnakinEngine
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
AnakinOpConverter
{
using
AnakinEngineT
=
AnakinEngine
<
TargetT
,
PrecisionT
>
;
public:
AnakinOpConverter
()
=
default
;
...
...
@@ -45,7 +45,7 @@ class AnakinOpConverter {
void
ConvertOp
(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
std
::
unordered_set
<
std
::
string
>
&
parameters
,
const
framework
::
Scope
&
scope
,
Anakin
NvEngine
*
engine
,
const
framework
::
Scope
&
scope
,
Anakin
EngineT
*
engine
,
bool
test_mode
=
false
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
std
::
string
op_type
=
op_desc
.
Type
();
...
...
@@ -65,7 +65,7 @@ class AnakinOpConverter {
void
ConvertBlock
(
framework
::
BlockDesc
*
block_desc
,
const
std
::
unordered_set
<
std
::
string
>
&
parameters
,
const
framework
::
Scope
&
scope
,
Anakin
NvEngine
*
engine
)
{
const
framework
::
Scope
&
scope
,
Anakin
EngineT
*
engine
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
framework
::
proto
::
BlockDesc
*
block
=
block_desc
->
Proto
();
for
(
auto
i
=
0
;
i
<
block
->
ops_size
();
i
++
)
{
...
...
@@ -79,9 +79,8 @@ class AnakinOpConverter {
framework
::
BlockDesc
*
block_desc
,
framework
::
Scope
*
scope
,
const
std
::
vector
<
std
::
string
>
&
inputs
,
const
std
::
unordered_set
<
std
::
string
>
&
parameters
,
const
std
::
vector
<
std
::
string
>
&
outputs
,
Anakin
NvEngine
*
engine
)
{
const
std
::
vector
<
std
::
string
>
&
outputs
,
Anakin
EngineT
*
engine
)
{
ConvertBlock
(
block_desc
,
parameters
,
*
scope
,
engine
);
engine
->
Freeze
();
// if the max_batch size
int
max_batch_size
=
engine
->
GetMaxBatchSize
();
PADDLE_ENFORCE
(
max_batch_size
>
0
,
...
...
@@ -91,6 +90,18 @@ class AnakinOpConverter {
// the block_desc.
auto
max_input_shape
=
engine
->
GetMaxInputShape
();
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
temp_max_input_shape
;
// Register outputs with anakin using the RegistVar interface before Freeze.
// Note that RegistVar's parameters can only be outputs, not inputs.
for
(
auto
&
output
:
outputs
)
{
engine
->
Graph
()
->
RegistVar
(
output
);
}
engine
->
Freeze
();
// Add scale for tensor in int8 mode.
auto
tensor_scales
=
engine
->
GetTensorScales
();
for
(
auto
&
item
:
tensor_scales
)
{
engine
->
Graph
()
->
SetVarScale
(
item
.
first
,
item
.
second
);
}
for
(
auto
&
input
:
inputs
)
{
if
(
parameters
.
count
(
input
))
continue
;
...
...
@@ -99,7 +110,7 @@ class AnakinOpConverter {
input_shape
[
0
]
=
max_batch_size
;
if
(
max_input_shape
.
count
(
input
))
{
PADDLE_ENFORCE
(
max_input_shape
[
input
].
size
()
==
4
,
"the dimensions of
max_input_shape setted from "
"the dimensions of max_input_shape setted from "
"config->EnableAnakinEngine must be 4"
);
for
(
int
i
=
1
;
i
<
4
;
i
++
)
{
input_shape
[
i
]
=
max_input_shape
[
input
][
i
];
...
...
@@ -118,50 +129,92 @@ class AnakinOpConverter {
}
temp_max_input_shape
[
input
]
=
input_shape
;
engine
->
SetInputShape
(
input
,
input_shape
);
engine
->
Graph
()
->
RegistVar
(
input
);
// For share from data.
}
engine
->
SetMaxInputShape
(
temp_max_input_shape
);
engine
->
Optimize
();
// For anakin share with fluid tensor.
engine
->
AllocTmpMem
();
engine
->
InitGraph
();
engine
->
InitNet
();
}
void
SetEngine
(
Anakin
NvEngine
*
engine
)
{
engine_
=
engine
;
}
void
SetEngine
(
Anakin
EngineT
*
engine
)
{
engine_
=
engine
;
}
virtual
~
AnakinOpConverter
()
{}
protected:
bool
test_mode_
;
Anakin
NvEngine
*
engine_
{
nullptr
};
Anakin
EngineT
*
engine_
{
nullptr
};
private:
std
::
unordered_map
<
std
::
string
,
AnakinOpConverter
*>
converters_
;
std
::
unordered_map
<
std
::
string
,
AnakinOpConverter
<
TargetT
,
PrecisionT
>
*>
converters_
;
framework
::
Scope
*
scope_
{
nullptr
};
std
::
mutex
mutex_
;
};
template
class
AnakinOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>;
template
class
AnakinOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>;
template
class
AnakinOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>;
template
class
AnakinOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>;
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
struct anakin_##op_type__##_converter \
: public ::paddle::framework::Registrar { \
anakin_##op_type__##_converter() { \
LOG(INFO) << "register convert " << #op_type__; \
::paddle::inference::Registry< \
::paddle::inference::anakin::AnakinOpConverter>::Global() \
.Register<::paddle::inference::anakin::Converter__>(#op_type__); \
} \
}; \
anakin_##op_type__##_converter anakin_##op_type__##_converter__; \
int TouchConverterRegister_anakin_##op_type__() { \
anakin_##op_type__##_converter__.Touch(); \
return 0; \
#define REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, \
place_type__, place_class__, \
precision_type__, precision_class__) \
struct anakin_##op_type__##_##place_type__##_##precision_type__##_converter \
: public ::paddle::framework::Registrar { \
anakin_##op_type__##_##place_type__##_##precision_type__##_converter() { \
LOG(INFO) << "register convert " << #op_type__ << " "; \
::paddle::inference::Registry< \
::paddle::inference::anakin::AnakinOpConverter< \
place_class__, precision_class__>>::Global() \
.Register<Converter__>(#op_type__); \
} \
}; \
anakin_##op_type__##_##place_type__##_##precision_type__##_converter \
anakin_##op_type__##_##place_type__##_##precision_type__##_converter__; \
int Touch_anakin_##op_type__##_##place_type__##_##precision_type__() { \
anakin_##op_type__##_##place_type__##_##precision_type__##_converter__ \
.Touch(); \
return 0; \
}
#define USE_ANAKIN_CONVERTER(op_type__) \
extern int TouchConverterRegister_anakin_##op_type__(); \
int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \
TouchConverterRegister_anakin_##op_type__();
#define REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CUDA, \
::anakin::saber::NV, FP32, \
::anakin::Precision::FP32)
#define REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CUDA, \
::anakin::saber::NV, INT8, \
::anakin::Precision::INT8)
#define REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CPU, \
::anakin::saber::X86, FP32, \
::anakin::Precision::FP32)
#define REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CPU, \
::anakin::saber::X86, INT8, \
::anakin::Precision::INT8)
#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \
extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \
int use_converter_anakin_##op_type__##_##place_type__##_##precision_type__ \
__attribute__((unused)) = \
Touch_anakin_##op_type__##_##place_type__##_##precision_type__();
#define USE_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32)
#define USE_INT8_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8)
#define USE_CPU_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32)
#define USE_CPU_INT8_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8)
paddle/fluid/inference/anakin/convert/pool2d.cc
浏览文件 @
9f1927bd
...
...
@@ -17,23 +17,16 @@
#include <string>
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
X86
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PBlock
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
Pool2dOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
Pool2dOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
...
...
@@ -65,17 +58,35 @@ void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
PADDLE_THROW
(
"TensorRT unsupported pooling type!"
);
}
engine_
->
AddOp
(
op_name
,
"Pooling"
,
{
x_name
},
{
y_name
});
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"pool_size"
,
ksize
);
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"strides"
,
strides
);
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"padding"
,
paddings
);
engine_
->
AddOpAttr
(
op_name
,
"method"
,
anakin_pool_type
);
engine_
->
AddOpAttr
(
op_name
,
"global_pooling"
,
global_pooling
);
engine_
->
AddOpAttr
(
op_name
,
"cmp_out_shape_floor_as_conv"
,
!
ceil_mode
);
this
->
engine_
->
AddOp
(
op_name
,
"Pooling"
,
{
x_name
},
{
y_name
});
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"pool_size"
,
ksize
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"strides"
,
strides
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"padding"
,
paddings
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"method"
,
anakin_pool_type
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"global_pooling"
,
global_pooling
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"cmp_out_shape_floor_as_conv"
,
!
ceil_mode
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
pool2d
,
Pool2dOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
pool2d_nv_float32
=
::
paddle
::
inference
::
anakin
::
Pool2dOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
pool2d_nv_int8
=
::
paddle
::
inference
::
anakin
::
Pool2dOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
pool2d
,
pool2d_nv_float32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
pool2d
,
pool2d_nv_int8
);
#endif
using
pool2d_cpu_float32
=
::
paddle
::
inference
::
anakin
::
Pool2dOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
pool2d_cpu_int8
=
::
paddle
::
inference
::
anakin
::
Pool2dOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
pool2d
,
pool2d_cpu_float32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
pool2d
,
pool2d_cpu_int8
);
paddle/fluid/inference/anakin/convert/pool2d.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
Pool2dOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
Pool2dOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
Pool2dOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/relu.cc
浏览文件 @
9f1927bd
...
...
@@ -16,19 +16,30 @@
#include <algorithm>
#include <map>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
ReluOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
ReluOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
auto
input_name
=
op_desc
.
Input
(
"X"
).
front
();
auto
output_name
=
op_desc
.
Output
(
"Out"
).
front
();
this
->
engine_
->
AddOp
(
op_name
,
"ReLU"
,
{
input_name
},
{
output_name
});
this
->
engine_
->
AddOpAttr
(
op_name
,
"alpha"
,
0
);
}
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
LeakyReluOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
...
...
@@ -37,12 +48,45 @@ void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
auto
input_name
=
op_desc
.
Input
(
"X"
).
front
();
auto
output_name
=
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
AddOp
(
op_name
,
"ReLU"
,
{
input_name
},
{
output_name
});
engine_
->
AddOpAttr
(
op_name
,
"alpha"
,
0
);
float
alpha
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"alpha"
));
this
->
engine_
->
AddOp
(
op_name
,
"ReLU"
,
{
input_name
},
{
output_name
});
this
->
engine_
->
AddOpAttr
(
op_name
,
"alpha"
,
alpha
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
relu
,
ReluOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
relu_nv_fp32
=
::
paddle
::
inference
::
anakin
::
ReluOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
leaky_nv_fp32
=
::
paddle
::
inference
::
anakin
::
LeakyReluOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
relu_nv_int8
=
::
paddle
::
inference
::
anakin
::
ReluOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
using
leaky_nv_int8
=
::
paddle
::
inference
::
anakin
::
LeakyReluOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
relu
,
relu_nv_fp32
);
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
leaky_relu
,
leaky_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
relu
,
relu_nv_int8
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
leaky_relu
,
leaky_nv_int8
);
#endif
using
relu_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
ReluOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
leaky_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
LeakyReluOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
relu_cpu_int8
=
::
paddle
::
inference
::
anakin
::
ReluOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
using
leaky_cpu_int8
=
::
paddle
::
inference
::
anakin
::
LeakyReluOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
relu
,
relu_cpu_fp32
);
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
leaky_relu
,
leaky_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
relu
,
relu_cpu_int8
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
leaky_relu
,
leaky_cpu_int8
);
paddle/fluid/inference/anakin/convert/relu.h
浏览文件 @
9f1927bd
...
...
@@ -22,7 +22,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
ReluOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
ReluOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
ReluOpConverter
()
=
default
;
...
...
@@ -33,6 +34,18 @@ class ReluOpConverter : public AnakinOpConverter {
virtual
~
ReluOpConverter
()
{}
};
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
LeakyReluOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
LeakyReluOpConverter
()
=
default
;
virtual
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
;
virtual
~
LeakyReluOpConverter
()
{}
};
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/anakin/convert/reshape.cc
浏览文件 @
9f1927bd
...
...
@@ -15,20 +15,16 @@
#include "paddle/fluid/inference/anakin/convert/reshape.h"
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
ReshapeOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
ReshapeOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1UL
);
...
...
@@ -37,17 +33,35 @@ void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
auto
output
=
op_desc
.
Output
(
"Out"
).
front
();
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
AddOp
(
op_name
,
"Reshape"
,
{
input
},
{
output
});
this
->
engine_
->
AddOp
(
op_name
,
"Reshape"
,
{
input
},
{
output
});
auto
shape
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"shape"
));
if
(
shape
.
size
()
<
4
)
{
shape
.
insert
(
shape
.
end
(),
4
-
shape
.
size
(),
1
);
}
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"dims"
,
shape
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"dims"
,
shape
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
reshape
,
ReshapeOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
reshape_nv_fp32
=
::
paddle
::
inference
::
anakin
::
ReshapeOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
reshape_nv_int8
=
::
paddle
::
inference
::
anakin
::
ReshapeOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
reshape
,
reshape_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
reshape
,
reshape_nv_int8
);
#endif
using
reshape_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
ReshapeOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
reshape_cpu_int8
=
::
paddle
::
inference
::
anakin
::
ReshapeOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
reshape
,
reshape_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
reshape
,
reshape_cpu_int8
);
paddle/fluid/inference/anakin/convert/reshape.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
ReshapeOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
ReshapeOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
ReshapeOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/roi_align.cc
0 → 100644
浏览文件 @
9f1927bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/roi_align.h"
#include <algorithm>
#include <map>
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
RoiAlignOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"ROIs"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
auto
input_x_name
=
op_desc
.
Input
(
"X"
).
front
();
auto
input_rois_name
=
op_desc
.
Input
(
"ROIs"
).
front
();
auto
output_name
=
op_desc
.
Output
(
"Out"
).
front
();
auto
spatial_scale
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"spatial_scale"
));
auto
pooled_height
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"pooled_height"
));
auto
pooled_width
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"pooled_width"
));
auto
sampling_ratio
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"sampling_ratio"
));
this
->
engine_
->
AddOp
(
op_name
,
"RoiAlign"
,
{
input_x_name
,
input_rois_name
},
{
output_name
});
this
->
engine_
->
AddOpAttr
(
op_name
,
"spatial_scale"
,
spatial_scale
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"pooled_height"
,
pooled_height
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"pooled_width"
,
pooled_width
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"sampling_ratio"
,
sampling_ratio
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
#ifdef PADDLE_WITH_CUDA
using
roi_align_nv_fp32
=
::
paddle
::
inference
::
anakin
::
RoiAlignOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
roi_align_nv_int8
=
::
paddle
::
inference
::
anakin
::
RoiAlignOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
roi_align
,
roi_align_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
roi_align
,
roi_align_nv_int8
);
#endif
using
roi_align_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
RoiAlignOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
roi_align_cpu_int8
=
::
paddle
::
inference
::
anakin
::
RoiAlignOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
roi_align
,
roi_align_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
roi_align
,
roi_align_cpu_int8
);
paddle/fluid/inference/anakin/convert/roi_align.h
0 → 100644
浏览文件 @
9f1927bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
RoiAlignOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
RoiAlignOpConverter
()
=
default
;
virtual
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
;
virtual
~
RoiAlignOpConverter
()
{}
};
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/anakin/convert/scale.cc
浏览文件 @
9f1927bd
...
...
@@ -16,19 +16,14 @@
#include <algorithm>
#include <map>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
ScaleOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
ScaleOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
...
...
@@ -44,14 +39,32 @@ void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
PADDLE_ENFORCE
(
bias_after_scale
,
"The anakin scale layer only support bias after scale now."
);
engine_
->
AddOp
(
op_name
,
"Power"
,
{
input_name
},
{
output_name
});
engine_
->
AddOpAttr
(
op_name
,
"shift"
,
bias
);
engine_
->
AddOpAttr
(
op_name
,
"scale"
,
scale
);
engine_
->
AddOpAttr
(
op_name
,
"power"
,
static_cast
<
float
>
(
1.0
));
this
->
engine_
->
AddOp
(
op_name
,
"Power"
,
{
input_name
},
{
output_name
});
this
->
engine_
->
AddOpAttr
(
op_name
,
"shift"
,
bias
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"scale"
,
scale
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"power"
,
static_cast
<
float
>
(
1.0
));
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
scale
,
ScaleOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
scale_nv_fp32
=
::
paddle
::
inference
::
anakin
::
ScaleOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
scale_nv_int8
=
::
paddle
::
inference
::
anakin
::
ScaleOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
scale
,
scale_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
scale
,
scale_nv_int8
);
#endif
using
scale_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
ScaleOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
scale_cpu_int8
=
::
paddle
::
inference
::
anakin
::
ScaleOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
scale
,
scale_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
scale
,
scale_cpu_int8
);
paddle/fluid/inference/anakin/convert/scale.h
浏览文件 @
9f1927bd
...
...
@@ -22,7 +22,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
ScaleOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
ScaleOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
ScaleOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/softmax.cc
浏览文件 @
9f1927bd
...
...
@@ -14,19 +14,14 @@
#include "paddle/fluid/inference/anakin/convert/softmax.h"
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
SoftMaxOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
SoftMaxOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1UL
);
...
...
@@ -41,12 +36,31 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
auto
input_shape_in_fluid
=
input_var_desc
->
GetShape
();
size_t
input_dims
=
input_shape_in_fluid
.
size
();
engine_
->
AddOp
(
op_name
,
"Softmax"
,
{
input
},
{
output
});
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
static_cast
<
int
>
(
input_dims
-
1
));
this
->
engine_
->
AddOp
(
op_name
,
"Softmax"
,
{
input
},
{
output
});
this
->
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
static_cast
<
int
>
(
input_dims
-
1
));
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
softmax
,
SoftMaxOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
sm_nv_fp32
=
::
paddle
::
inference
::
anakin
::
SoftMaxOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
sm_nv_int8
=
::
paddle
::
inference
::
anakin
::
SoftMaxOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
softmax
,
sm_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
softmax
,
sm_nv_int8
);
#endif
using
sm_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
SoftMaxOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
sm_cpu_int8
=
::
paddle
::
inference
::
anakin
::
SoftMaxOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
softmax
,
sm_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
softmax
,
sm_cpu_int8
);
paddle/fluid/inference/anakin/convert/softmax.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
SoftMaxOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
SoftMaxOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
SoftMaxOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/split.cc
浏览文件 @
9f1927bd
...
...
@@ -16,23 +16,16 @@
#include <algorithm>
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
X86
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PBlock
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
SplitOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
SplitOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
auto
input_name
=
op_desc
.
Input
(
"X"
).
front
();
auto
y_names
=
op_desc
.
Output
(
"Out"
);
...
...
@@ -51,14 +44,34 @@ void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
num_sum
+=
output_lengths
[
i
];
slice_point
.
push_back
(
num_sum
);
}
engine_
->
AddOp
(
op_name
,
"Slice"
,
{
input_name
},
y_names
);
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
axis
);
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"slice_point"
,
slice_point
);
this
->
engine_
->
AddOp
(
op_name
,
"Slice"
,
{
input_name
},
y_names
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"axis"
,
axis
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"slice_point"
,
slice_point
);
// slice_dim is useless in anakin
engine_
->
AddOpAttr
(
op_name
,
"slice_dim"
,
4
);
this
->
engine_
->
AddOpAttr
(
op_name
,
"slice_dim"
,
4
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
split
,
SplitOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
split_nv_fp32
=
::
paddle
::
inference
::
anakin
::
SplitOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
split_nv_int8
=
::
paddle
::
inference
::
anakin
::
SplitOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
split
,
split_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
split
,
split_nv_int8
);
#endif
using
split_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
SplitOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
split_cpu_int8
=
::
paddle
::
inference
::
anakin
::
SplitOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
split
,
split_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
split
,
split_cpu_int8
);
paddle/fluid/inference/anakin/convert/split.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
SplitOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
SplitOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
SplitOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/sum.cc
浏览文件 @
9f1927bd
...
...
@@ -17,22 +17,16 @@
#include <string>
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
X86
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PBlock
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
SumOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
SumOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
2
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
...
...
@@ -43,13 +37,32 @@ void SumOpConverter::operator()(const framework::proto::OpDesc &op,
std
::
vector
<
float
>
coeff
=
{
1
,
1
};
std
::
string
elementwise_type
=
"Add"
;
engine_
->
AddOp
(
op_name
,
"Eltwise"
,
input_names
,
{
out_name
});
engine_
->
AddOpAttr
<
PTuple
<
float
>>
(
op_name
,
"coeff"
,
coeff
);
engine_
->
AddOpAttr
<
std
::
string
>
(
op_name
,
"type"
,
elementwise_type
);
this
->
engine_
->
AddOp
(
op_name
,
"Eltwise"
,
input_names
,
{
out_name
});
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
float
>
>
(
op_name
,
"coeff"
,
coeff
);
this
->
engine_
->
template
AddOpAttr
<
std
::
string
>(
op_name
,
"type"
,
elementwise_type
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
sum
,
SumOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
sum_nv_fp32
=
::
paddle
::
inference
::
anakin
::
SumOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
sum_nv_int8
=
::
paddle
::
inference
::
anakin
::
SumOpConverter
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
sum
,
sum_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
sum
,
sum_nv_int8
);
#endif
using
sum_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
SumOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
sum_cpu_int8
=
::
paddle
::
inference
::
anakin
::
SumOpConverter
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
sum
,
sum_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
sum
,
sum_cpu_int8
);
paddle/fluid/inference/anakin/convert/sum.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
SumOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
SumOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
SumOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/test_activation_op.cc
浏览文件 @
9f1927bd
...
...
@@ -21,12 +21,14 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
static
void
test_activation_op
(
const
std
::
string
&
op_type
)
{
auto
*
converter
=
Registry
<
AnakinOpConverter
>::
Global
().
Lookup
(
op_type
);
PADDLE_ENFORCE
(
converter
!=
nullptr
);
template
<
typename
TargetT
>
static
void
test_activation_op
(
const
std
::
string
&
op_type
,
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"act-X"
,
{
10
,
6
,
1
,
1
});
validator
.
DeclOutputVar
(
"act-Out"
,
{
10
,
6
,
1
,
1
});
framework
::
OpDesc
desc
;
...
...
@@ -41,13 +43,44 @@ static void test_activation_op(const std::string &op_type) {
validator
.
Execute
(
5
);
}
TEST
(
sigm_op
,
test
)
{
test_activation_op
(
"sigmoid"
);
}
TEST
(
tanh_op
,
test
)
{
test_activation_op
(
"tanh"
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
sigm_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_activation_op
<::
anakin
::
saber
::
NV
>
(
"sigmoid"
,
ctx
,
true
);
}
TEST
(
tanh_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_activation_op
<::
anakin
::
saber
::
NV
>
(
"tanh"
,
ctx
,
true
);
}
#endif
/*
TEST(sigm_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("sigmoid", ctx, false);
}
TEST(tanh_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("tanh", ctx, false);
}
*/
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
sigmoid
);
USE_OP
(
tanh
);
USE_CPU_ANAKIN_CONVERTER
(
sigmoid
);
USE_CPU_ANAKIN_CONVERTER
(
tanh
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
sigmoid
);
USE_ANAKIN_CONVERTER
(
tanh
);
#endif
paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc
0 → 100644
浏览文件 @
9f1927bd
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/affine_channel.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
template
<
typename
TargetT
>
void
test_affine_channel_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
// Declare the difference between the inputs.
std
::
unordered_set
<
std
::
string
>
parameters
({
"scale"
,
"bias"
});
framework
::
Scope
scope
;
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"x"
,
{
1
,
3
,
5
,
2
});
validator
.
DeclOutputVar
(
"out"
,
{
1
,
3
,
5
,
2
});
validator
.
DeclParamVar
(
"scale"
,
{
3
});
validator
.
DeclParamVar
(
"bias"
,
{
3
});
// Prepare Op descriptions.
framework
::
OpDesc
desc
;
desc
.
SetType
(
"affine_channel"
);
desc
.
SetInput
(
"X"
,
{
"x"
});
desc
.
SetInput
(
"Bias"
,
{
"bias"
});
desc
.
SetInput
(
"Scale"
,
{
"scale"
});
desc
.
SetOutput
(
"Out"
,
{
"out"
});
// Layout must be explicitly specified here as NCHW.
desc
.
SetAttr
(
"data_layout"
,
std
::
string
(
"NCHW"
));
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
1
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
affine_channel_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_affine_channel_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
TEST
(
affine_channel_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_affine_channel_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
affine_channel
);
USE_CPU_ANAKIN_CONVERTER
(
affine_channel
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
affine_channel
);
#endif
paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
浏览文件 @
9f1927bd
...
...
@@ -19,12 +19,14 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
TEST
(
batch_norm_op
,
test
)
{
template
<
typename
TargetT
>
void
test_batchnorm_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
(
{
"batch_norm_scale"
,
"batch_norm_bias"
,
"batch_norm_mean"
,
"batch_norm_variance"
});
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
std
::
vector
<
int
>
param_shape
{
2
};
validator
.
DeclInputVar
(
"batch_norm_X"
,
{
1
,
2
,
5
,
5
});
...
...
@@ -64,8 +66,26 @@ TEST(batch_norm_op, test) {
validator
.
Execute
(
1
,
neglected_output
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
batch_norm_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_batchnorm_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
TEST
(
batch_norm_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_batchnorm_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
batch_norm
);
USE_CPU_ANAKIN_CONVERTER
(
batch_norm
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
batch_norm
);
#endif
paddle/fluid/inference/anakin/convert/test_concat_op.cc
浏览文件 @
9f1927bd
...
...
@@ -21,10 +21,12 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
TEST
(
concat_op
,
test
)
{
template
<
typename
TargetT
>
void
test_concat_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
""
});
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"concat_x1"
,
{
1
,
2
,
1
,
1
});
validator
.
DeclInputVar
(
"concat_x2"
,
{
1
,
3
,
1
,
1
});
validator
.
DeclInputVar
(
"concat_x3"
,
{
1
,
1
,
1
,
1
});
...
...
@@ -44,31 +46,26 @@ TEST(concat_op, test) {
validator
.
Execute
(
1
);
}
TEST
(
concat_op
,
test2
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
""
});
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
validator
.
DeclInputVar
(
"concat_x1"
,
{
1
,
4
});
validator
.
DeclInputVar
(
"concat_x2"
,
{
3
,
4
});
validator
.
DeclInputVar
(
"concat_x3"
,
{
2
,
4
});
validator
.
DeclOutputVar
(
"concat_out"
,
{
6
,
4
});
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"concat"
);
desc
.
SetInput
(
"X"
,
{
"concat_x1"
,
"concat_x2"
,
"concat_x3"
});
desc
.
SetOutput
(
"Out"
,
{
"concat_out"
});
int
axis
=
0
;
desc
.
SetAttr
(
"axis"
,
axis
);
validator
.
SetOp
(
*
desc
.
Proto
());
#ifdef PADDLE_WITH_CUDA
TEST
(
concat_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_concat_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
validator
.
Execute
(
1
);
TEST
(
concat_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_concat_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
concat
);
USE_CPU_ANAKIN_CONVERTER
(
concat
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
concat
);
#endif
paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
浏览文件 @
9f1927bd
...
...
@@ -21,13 +21,12 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
TEST
(
conv2d_op
,
test
)
{
auto
*
conv2d_converter
=
Registry
<
AnakinOpConverter
>::
Global
().
Lookup
(
"conv2d"
);
ASSERT_TRUE
(
conv2d_converter
!=
nullptr
);
template
<
typename
TargetT
>
void
test_conv2d_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
"conv2d-Y"
});
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"conv2d-X"
,
{
1
,
3
,
3
,
3
});
validator
.
DeclParamVar
(
"conv2d-Y"
,
{
4
,
3
,
1
,
1
});
validator
.
DeclOutputVar
(
"conv2d-Out"
,
{
1
,
4
,
3
,
3
});
...
...
@@ -54,9 +53,27 @@ TEST(conv2d_op, test) {
validator
.
Execute
(
3
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
conv2d_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_conv2d_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
TEST
(
conv2d_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_conv2d_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
conv2d
);
USE_CPU_ANAKIN_CONVERTER
(
conv2d
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
conv2d
);
#endif
paddle/fluid/inference/anakin/convert/test_dropout_op.cc
浏览文件 @
9f1927bd
...
...
@@ -21,10 +21,12 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
TEST
(
dropout_op
,
native
)
{
template
<
typename
TargetT
>
void
test_dropout_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"x"
,
{
1
,
1
,
2
,
2
});
validator
.
DeclOutputVar
(
"out"
,
{
1
,
1
,
2
,
2
});
validator
.
DeclOutputVar
(
"mask"
,
{
1
,
1
,
2
,
2
});
...
...
@@ -45,9 +47,26 @@ TEST(dropout_op, native) {
validator
.
Execute
(
1
,
neglected_output
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
dropout_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_dropout_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
TEST
(
dropout_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_dropout_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
dropout
);
USE_CPU_ANAKIN_CONVERTER
(
dropout
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
dropout
);
#endif
paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
浏览文件 @
9f1927bd
...
...
@@ -21,10 +21,14 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
static
void
test_elementwise_op
(
const
std
::
string
&
op_type
)
{
template
<
typename
TargetT
>
static
void
test_elementwise_op
(
const
std
::
string
&
op_type
,
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"x"
,
{
1
,
1
,
2
,
2
});
validator
.
DeclInputVar
(
"y"
,
{
1
,
1
,
2
,
2
});
validator
.
DeclOutputVar
(
"out"
,
{
1
,
1
,
2
,
2
});
...
...
@@ -43,14 +47,41 @@ static void test_elementwise_op(const std::string &op_type) {
validator
.
Execute
(
1
);
}
TEST
(
elementwise_op
,
native_add
)
{
test_elementwise_op
(
"elementwise_add"
);
}
TEST
(
elementwise_op
,
native_mul
)
{
test_elementwise_op
(
"elementwise_mul"
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
elementwise_op
,
native_add_gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_elementwise_op
<::
anakin
::
saber
::
NV
>
(
"elementwise_add"
,
ctx
,
true
);
}
TEST
(
elementwise_op
,
native_mul_gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_elementwise_op
<::
anakin
::
saber
::
NV
>
(
"elementwise_mul"
,
ctx
,
true
);
}
#endif
TEST
(
elementwise_op
,
native_add_cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_elementwise_op
<::
anakin
::
saber
::
X86
>
(
"elementwise_add"
,
ctx
,
false
);
}
TEST
(
elementwise_op
,
native_mul_cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_elementwise_op
<::
anakin
::
saber
::
X86
>
(
"elementwise_mul"
,
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
elementwise_add
);
USE_ANAKIN_CONVERTER
(
elementwise_add
);
USE_OP
(
elementwise_mul
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
elementwise_add
);
USE_ANAKIN_CONVERTER
(
elementwise_mul
);
#endif
USE_CPU_ANAKIN_CONVERTER
(
elementwise_add
);
USE_CPU_ANAKIN_CONVERTER
(
elementwise_mul
);
paddle/fluid/inference/anakin/convert/test_fc_op.cc
浏览文件 @
9f1927bd
...
...
@@ -20,13 +20,13 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
TEST
(
fc_op
,
test
)
{
auto
*
fc_converter
=
Registry
<
AnakinOpConverter
>::
Global
().
Lookup
(
"fc"
);
ASSERT_TRUE
(
fc_converter
);
template
<
typename
TargetT
>
void
test_mul_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
"mul_y"
});
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"mul_x"
,
{
1
,
1
,
2
,
2
});
validator
.
DeclParamVar
(
"mul_y"
,
{
4
,
2
});
validator
.
DeclOutputVar
(
"mul_out"
,
{
1
,
2
});
...
...
@@ -42,9 +42,26 @@ TEST(fc_op, test) {
validator
.
Execute
(
10
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
mul_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_mul_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
TEST
(
mul_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_mul_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
mul
);
USE_CPU_ANAKIN_CONVERTER
(
fc
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
fc
);
#endif
paddle/fluid/inference/anakin/convert/test_flatten_op.cc
浏览文件 @
9f1927bd
...
...
@@ -20,13 +20,12 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
TEST
(
flatten_op
,
test
)
{
auto
*
converter
=
Registry
<
AnakinOpConverter
>::
Global
().
Lookup
(
"flatten"
);
ASSERT_TRUE
(
converter
);
template
<
typename
TargetT
>
void
test_flatten_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"flatten-X"
,
{
3
,
10
,
10
,
4
});
validator
.
DeclOutputVar
(
"flatten-Out"
,
{
3
,
400
,
1
,
1
});
framework
::
OpDesc
desc
;
...
...
@@ -42,10 +41,27 @@ TEST(flatten_op, test) {
validator
.
Execute
(
5
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
flatten_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_flatten_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
TEST
(
flatten_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_flatten_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
reshape
);
USE_OP_ITSELF
(
flatten
);
USE_CPU_ANAKIN_CONVERTER
(
flatten
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
flatten
);
#endif
paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
浏览文件 @
9f1927bd
...
...
@@ -19,15 +19,14 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
void
test_pool2d
(
bool
global_pooling
,
bool
ceil_mode
,
template
<
typename
TargetT
>
void
test_pool2d
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
,
bool
global_pooling
,
bool
ceil_mode
,
std
::
string
pool_type
=
"max"
)
{
auto
*
pool2d_converter
=
Registry
<
AnakinOpConverter
>::
Global
().
Lookup
(
"pool2d"
);
ASSERT_TRUE
(
pool2d_converter
);
framework
::
Scope
scope
;
std
::
unordered_set
<
std
::
string
>
parameters
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
...
...
@@ -64,56 +63,61 @@ void test_pool2d(bool global_pooling, bool ceil_mode,
validator
.
Execute
(
1
);
}
void
test_pool2d2
(
bool
global_pooling
,
bool
ceil_mode
,
std
::
string
pool_type
=
"max"
)
{
auto
*
pool2d_converter
=
Registry
<
AnakinOpConverter
>::
Global
().
Lookup
(
"pool2d"
);
ASSERT_TRUE
(
pool2d_converter
);
framework
::
Scope
scope
;
std
::
unordered_set
<
std
::
string
>
parameters
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
validator
.
DeclInputVar
(
"pool2d_x"
,
{
1
,
1
,
17
,
17
});
validator
.
DeclOutputVar
(
"pool2d_out"
,
{
1
,
1
,
17
,
17
});
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"pool2d"
);
desc
.
SetInput
(
"X"
,
{
"pool2d_x"
});
desc
.
SetOutput
(
"Out"
,
{
"pool2d_out"
});
std
::
vector
<
int
>
ksize
({
3
,
3
});
std
::
vector
<
int
>
strides
({
1
,
1
});
std
::
vector
<
int
>
paddings
({
1
,
1
});
std
::
string
pooling_t
=
pool_type
;
#ifdef PADDLE_WITH_CUDA
TEST
(
Pool2dOpConverter
,
normal
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_pool2d
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
,
false
,
false
);
}
TEST
(
Pool2dOpConverter
,
test_global_pooling
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_pool2d
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
,
true
,
false
);
}
desc
.
SetAttr
(
"pooling_type"
,
pooling_t
);
desc
.
SetAttr
(
"ksize"
,
ksize
);
desc
.
SetAttr
(
"strides"
,
strides
);
desc
.
SetAttr
(
"paddings"
,
paddings
);
desc
.
SetAttr
(
"global_pooling"
,
global_pooling
);
desc
.
SetAttr
(
"ceil_mode"
,
true
);
TEST
(
Pool2dOpConverter
,
max_ceil_test
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_pool2d
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
,
false
,
true
);
}
LOG
(
INFO
)
<<
"set OP"
;
validator
.
SetOp
(
*
desc
.
Proto
());
LOG
(
INFO
)
<<
"execute"
;
TEST
(
Pool2dOpConverter
,
avg_ceil_test
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_pool2d
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
,
false
,
true
,
"avg"
);
}
#endif
validator
.
Execute
(
1
);
TEST
(
Pool2dOpConverter
,
normal_cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_pool2d
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
,
false
,
false
);
}
TEST
(
Pool2dOpConverter
,
test_global_pooling_cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_pool2d
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
,
true
,
false
);
}
TEST
(
Pool2dOpConverter
,
normal
)
{
test_pool2d
(
false
,
false
);
}
TEST
(
Pool2dOpConverter
,
test_global_pooling
)
{
test_pool2d
(
true
,
false
);
}
TEST
(
Pool2dOpConverter
,
max_ceil_test_cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_pool2d
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
,
false
,
true
);
}
TEST
(
Pool2dOpConverter
,
max_ceil_test
)
{
test_pool2d
(
false
,
true
);
}
TEST
(
Pool2dOpConverter
,
avg_ceil_test
)
{
test_pool2d
(
false
,
true
,
"avg"
);
}
TEST
(
Pool2dOpConverter
,
avg_ceil_test2
)
{
test_pool2d2
(
false
,
true
,
"avg"
);
}
TEST
(
Pool2dOpConverter
,
avg_ceil_test_cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_pool2d
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
,
false
,
true
,
"avg"
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
pool2d
);
USE_CPU_ANAKIN_CONVERTER
(
pool2d
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
pool2d
);
#endif
paddle/fluid/inference/anakin/convert/test_relu_op.cc
浏览文件 @
9f1927bd
...
...
@@ -21,18 +21,23 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
static
void
test_activation_op
(
const
std
::
string
&
op_type
)
{
auto
*
converter
=
Registry
<
AnakinOpConverter
>::
Global
().
Lookup
(
op_type
);
PADDLE_ENFORCE
(
converter
!=
nullptr
);
template
<
typename
TargetT
>
static
void
test_activation_op
(
const
std
::
string
&
op_type
,
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"act-X"
,
{
10
,
6
,
1
,
1
});
validator
.
DeclOutputVar
(
"act-Out"
,
{
10
,
6
,
1
,
1
});
framework
::
OpDesc
desc
;
desc
.
SetType
(
op_type
);
desc
.
SetInput
(
"X"
,
{
"act-X"
});
desc
.
SetOutput
(
"Out"
,
{
"act-Out"
});
if
(
op_type
==
"leaky_relu"
)
{
desc
.
SetAttr
(
"alpha"
,
0.1
f
);
}
LOG
(
INFO
)
<<
"set OP"
;
validator
.
SetOp
(
*
desc
.
Proto
());
...
...
@@ -41,10 +46,30 @@ static void test_activation_op(const std::string &op_type) {
validator
.
Execute
(
5
);
}
TEST
(
sigm_op
,
test
)
{
test_activation_op
(
"relu"
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
relu_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_activation_op
<::
anakin
::
saber
::
NV
>
(
"relu"
,
ctx
,
true
);
}
TEST
(
leaky_relu_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_activation_op
<::
anakin
::
saber
::
NV
>
(
"leaky_relu"
,
ctx
,
true
);
}
#endif
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
relu
);
USE_OP
(
leaky_relu
);
USE_CPU_ANAKIN_CONVERTER
(
relu
);
USE_CPU_ANAKIN_CONVERTER
(
leaky_relu
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
relu
);
USE_ANAKIN_CONVERTER
(
leaky_relu
);
#endif
paddle/fluid/inference/anakin/convert/test_reshape_op.cc
浏览文件 @
9f1927bd
...
...
@@ -20,12 +20,12 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
TEST
(
reshape
,
test
)
{
auto
*
converter
=
Registry
<
AnakinOpConverter
>::
Global
().
Lookup
(
"reshape"
);
ASSERT_TRUE
(
converter
);
template
<
typename
TargetT
>
void
test_reshape1_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
framework
::
Scope
scope
;
std
::
unordered_set
<
std
::
string
>
parameters
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
// validator.DeclInputVar("reshape-X", {2, 3, 3, 1});
// validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3});
...
...
@@ -45,10 +45,12 @@ TEST(reshape, test) {
validator
.
Execute
(
1
);
}
TEST
(
reshape
,
test2
)
{
template
<
typename
TargetT
>
void
test_reshape2_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
framework
::
Scope
scope
;
std
::
unordered_set
<
std
::
string
>
parameters
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"reshape-X"
,
{
1
,
2
,
4
});
validator
.
DeclOutputVar
(
"reshape-Out"
,
{
1
,
4
,
2
});
...
...
@@ -66,9 +68,39 @@ TEST(reshape, test2) {
validator
.
Execute
(
1
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
reshape1_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_reshape1_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
TEST
(
reshape2_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_reshape2_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
TEST
(
reshape1_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_reshape2_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
TEST
(
reshape2_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_reshape2_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
reshape
);
USE_CPU_ANAKIN_CONVERTER
(
reshape
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
reshape
);
#endif
paddle/fluid/inference/anakin/convert/test_softmax_op.cc
浏览文件 @
9f1927bd
...
...
@@ -20,12 +20,12 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
TEST
(
softmax
,
test
)
{
auto
*
converter
=
Registry
<
AnakinOpConverter
>::
Global
().
Lookup
(
"softmax"
);
ASSERT_TRUE
(
converter
);
template
<
typename
TargetT
>
void
test_softmax_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
framework
::
Scope
scope
;
std
::
unordered_set
<
std
::
string
>
parameters
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"softmax-X"
,
{
1
,
10
,
2
});
validator
.
DeclOutputVar
(
"softmax-Out"
,
{
1
,
10
,
2
});
...
...
@@ -41,9 +41,27 @@ TEST(softmax, test) {
validator
.
Execute
(
1
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
softmax_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_softmax_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
TEST
(
relu_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_softmax_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
softmax
);
USE_CPU_ANAKIN_CONVERTER
(
softmax
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
softmax
);
#endif
paddle/fluid/inference/anakin/convert/test_split_op.cc
浏览文件 @
9f1927bd
...
...
@@ -21,12 +21,14 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
template
<
int
Axis
>
void
AnakinSliceTest
(
const
std
::
vector
<
int
>
&
in_shape
,
template
<
typename
TargetT
,
int
Axis
>
void
AnakinSliceTest
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
,
const
std
::
vector
<
int
>
&
in_shape
,
const
std
::
vector
<
int
>
&
sections
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
""
});
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"split_input"
,
in_shape
);
std
::
vector
<
std
::
string
>
output_vars
;
...
...
@@ -55,51 +57,58 @@ void AnakinSliceTest(const std::vector<int> &in_shape,
// batch = 0, axis = 1, same shape
TEST
(
split_op
,
test_same_shape_axis1_batch1
)
{
AnakinSliceTest
<
1
>
({
1
,
4
,
2
,
2
},
{
2
,
2
});
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
AnakinSliceTest
<::
anakin
::
saber
::
NV
,
1
>
(
ctx
,
true
,
{
1
,
4
,
2
,
2
},
{
2
,
2
});
}
// batch = 0, axis = 1, different shape
TEST
(
split_op
,
test_different_shape_axis1_batch1
)
{
AnakinSliceTest
<
1
>
({
1
,
3
,
2
,
2
},
{
2
,
1
});
}
// batch = 10, axis = 1, same shape
TEST
(
split_op
,
test_same_shape_axis1_batch10
)
{
AnakinSliceTest
<
1
>
({
1
,
4
,
2
,
2
},
{
2
,
2
});
}
// batch = 10, axis = 1, different shape
TEST
(
split_op
,
test_different_shape_axis1_batch10
)
{
AnakinSliceTest
<
1
>
({
1
,
3
,
2
,
2
},
{
2
,
1
});
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
AnakinSliceTest
<::
anakin
::
saber
::
NV
,
1
>
(
ctx
,
true
,
{
1
,
3
,
2
,
2
},
{
2
,
1
});
}
// batch = 0, axis = 2, same shape
TEST
(
split_op
,
test_same_shape_axis2_batch1
)
{
AnakinSliceTest
<
2
>
({
1
,
3
,
4
,
2
},
{
2
,
2
});
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
AnakinSliceTest
<::
anakin
::
saber
::
NV
,
2
>
(
ctx
,
true
,
{
1
,
3
,
4
,
2
},
{
2
,
2
});
}
// batch = 0, axis = 2, different shape
TEST
(
split_op
,
test_different_shape_axis2_batch1
)
{
AnakinSliceTest
<
2
>
({
1
,
3
,
3
,
2
},
{
2
,
1
});
}
// batch = 10, axis = 2, same shape
TEST
(
split_op
,
test_same_shape_axis2_batch10
)
{
AnakinSliceTest
<
2
>
({
1
,
3
,
4
,
2
},
{
2
,
2
});
}
// batch = 10, axis = 2, different shape
TEST
(
split_op
,
test_different_shape_axis2_batch10
)
{
AnakinSliceTest
<
2
>
({
1
,
3
,
3
,
2
},
{
2
,
1
});
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
AnakinSliceTest
<::
anakin
::
saber
::
NV
,
2
>
(
ctx
,
true
,
{
1
,
3
,
3
,
2
},
{
2
,
1
});
}
// batch = 0, axis = 3, same shape
TEST
(
split_op
,
test_same_shape_axis3_batch1
)
{
AnakinSliceTest
<
3
>
({
1
,
3
,
2
,
4
},
{
2
,
2
});
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
AnakinSliceTest
<::
anakin
::
saber
::
NV
,
3
>
(
ctx
,
true
,
{
1
,
3
,
2
,
4
},
{
2
,
2
});
}
// batch = 0, axis = 3, different shape
TEST
(
split_op
,
test_different_shape_axis3_batch1
)
{
AnakinSliceTest
<
3
>
({
1
,
3
,
2
,
3
},
{
2
,
1
});
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
AnakinSliceTest
<::
anakin
::
saber
::
NV
,
3
>
(
ctx
,
true
,
{
1
,
3
,
2
,
3
},
{
2
,
1
});
}
// batch = 10, axis = 3, same shape
TEST
(
split_op
,
test_same_shape_axis3_batch10
)
{
AnakinSliceTest
<
3
>
({
1
,
3
,
2
,
4
},
{
2
,
2
});
TEST
(
split_op
,
test_different_shape_axis1_batch1_cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
AnakinSliceTest
<::
anakin
::
saber
::
X86
,
1
>
(
ctx
,
false
,
{
1
,
3
,
2
,
3
},
{
2
,
1
});
}
TEST
(
split_op
,
test_different_shape_axis2_batch1_cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
AnakinSliceTest
<::
anakin
::
saber
::
X86
,
2
>
(
ctx
,
false
,
{
1
,
3
,
4
,
2
},
{
2
,
2
});
}
// batch = 10, axis = 3, different shape
TEST
(
split_op
,
test_different_shape_axis3_batch10
)
{
AnakinSliceTest
<
3
>
({
1
,
3
,
2
,
3
},
{
2
,
1
});
TEST
(
split_op
,
test_different_shape_axis3_batch1_cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
AnakinSliceTest
<::
anakin
::
saber
::
X86
,
3
>
(
ctx
,
false
,
{
1
,
3
,
2
,
4
},
{
2
,
2
});
}
}
// namespace anakin
...
...
@@ -107,4 +116,7 @@ TEST(split_op, test_different_shape_axis3_batch10) {
}
// namespace paddle
USE_OP
(
split
);
USE_CPU_ANAKIN_CONVERTER
(
split
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
split
);
#endif
paddle/fluid/inference/anakin/convert/test_sum_op.cc
浏览文件 @
9f1927bd
...
...
@@ -22,10 +22,12 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
TEST
(
sum
,
native
)
{
template
<
typename
TargetT
>
static
void
test_sum_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"sum_x1"
,
{
1
,
2
,
1
,
2
});
validator
.
DeclInputVar
(
"sum_x2"
,
{
1
,
2
,
1
,
2
});
validator
.
DeclOutputVar
(
"sum_out"
,
{
1
,
2
,
1
,
2
});
...
...
@@ -40,9 +42,26 @@ TEST(sum, native) {
validator
.
Execute
(
1
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
sum_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_sum_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
TEST
(
sum_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_sum_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
sum
);
USE_CPU_ANAKIN_CONVERTER
(
sum
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
sum
);
#endif
paddle/fluid/inference/anakin/convert/test_transpose_op.cc
浏览文件 @
9f1927bd
...
...
@@ -20,12 +20,12 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
TEST
(
transpose_op
,
test
)
{
auto
*
converter
=
Registry
<
AnakinOpConverter
>::
Global
().
Lookup
(
"transpose"
);
ASSERT_TRUE
(
converter
!=
nullptr
);
template
<
typename
TargetT
>
void
test_transpose1_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"transpose-X"
,
{
2
,
3
,
4
,
5
});
validator
.
DeclOutputVar
(
"transpose-Out"
,
{
4
,
2
,
5
,
3
});
...
...
@@ -43,11 +43,12 @@ TEST(transpose_op, test) {
validator
.
Execute
(
3
);
}
// test input shape's dims < 4
TEST
(
transpose_op
,
test2
)
{
template
<
typename
TargetT
>
void
test_transpose2_op
(
const
platform
::
DeviceContext
&
context
,
bool
use_gpu
)
{
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
AnakinConvertValidation
validator
(
parameters
,
&
scope
);
AnakinConvertValidation
<
TargetT
,
::
anakin
::
Precision
::
FP32
>
validator
(
parameters
,
&
scope
,
context
,
use_gpu
);
validator
.
DeclInputVar
(
"transpose-X"
,
{
3
,
4
,
5
});
validator
.
DeclOutputVar
(
"transpose-Out"
,
{
3
,
5
,
4
});
...
...
@@ -65,9 +66,38 @@ TEST(transpose_op, test2) {
validator
.
Execute
(
1
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
transpose1_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_transpose1_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
TEST
(
transpose2_op
,
gpu
)
{
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
CUDADeviceContext
ctx
(
gpu_place
);
test_transpose2_op
<::
anakin
::
saber
::
NV
>
(
ctx
,
true
);
}
#endif
TEST
(
transpose1_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_transpose2_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
TEST
(
transpose2_op
,
cpu
)
{
platform
::
CPUPlace
cpu_place
;
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
test_transpose2_op
<::
anakin
::
saber
::
X86
>
(
ctx
,
false
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
USE_OP
(
transpose
);
USE_CPU_ANAKIN_CONVERTER
(
transpose
);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER
(
transpose
);
#endif
paddle/fluid/inference/anakin/convert/transpose.cc
浏览文件 @
9f1927bd
...
...
@@ -17,20 +17,16 @@
#include <string>
#include <vector>
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
namespace
anakin
{
void
TransposeOpConverter
::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
void
TransposeOpConverter
<
TargetT
,
PrecisionT
>::
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
BlockDesc
&
block_desc
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
{
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
...
...
@@ -38,7 +34,7 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
auto
input
=
op_desc
.
Input
(
"X"
).
front
();
auto
output
=
op_desc
.
Output
(
"Out"
).
front
();
auto
op_name
=
op_desc
.
Type
()
+
":"
+
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
AddOp
(
op_name
,
"Permute"
,
{
input
},
{
output
});
this
->
engine_
->
AddOp
(
op_name
,
"Permute"
,
{
input
},
{
output
});
auto
axis
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"axis"
));
size_t
axis_size
=
axis
.
size
();
...
...
@@ -46,11 +42,25 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
axis
.
push_back
(
axis_size
);
axis_size
+=
1
;
}
engine_
->
AddOpAttr
<
PTuple
<
int
>>
(
op_name
,
"dims"
,
axis
);
this
->
engine_
->
template
AddOpAttr
<
PTuple
<
int
>
>
(
op_name
,
"dims"
,
axis
);
}
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
REGISTER_ANAKIN_OP_CONVERTER
(
transpose
,
TransposeOpConverter
);
#ifdef PADDLE_WITH_CUDA
using
transpose_nv_fp32
=
::
paddle
::
inference
::
anakin
::
TransposeOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>
;
using
transpose_nv_int8
=
::
paddle
::
inference
::
anakin
::
TransposeOpConverter
<
::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CUDA_ANAKIN_OP_CONVERTER
(
transpose
,
transpose_nv_fp32
);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER
(
transpose
,
transpose_nv_int8
);
#endif
using
transpose_cpu_fp32
=
::
paddle
::
inference
::
anakin
::
TransposeOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>
;
using
transpose_cpu_int8
=
::
paddle
::
inference
::
anakin
::
TransposeOpConverter
<
::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>
;
REGISTER_CPU_ANAKIN_OP_CONVERTER
(
transpose
,
transpose_cpu_fp32
);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER
(
transpose
,
transpose_cpu_int8
);
paddle/fluid/inference/anakin/convert/transpose.h
浏览文件 @
9f1927bd
...
...
@@ -20,7 +20,8 @@ namespace paddle {
namespace
inference
{
namespace
anakin
{
class
TransposeOpConverter
:
public
AnakinOpConverter
{
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
TransposeOpConverter
:
public
AnakinOpConverter
<
TargetT
,
PrecisionT
>
{
public:
TransposeOpConverter
()
=
default
;
...
...
paddle/fluid/inference/anakin/convert/ut_helper.h
浏览文件 @
9f1927bd
...
...
@@ -32,14 +32,8 @@ limitations under the License. */
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
X86
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PBlock
;
using
anakin
::
PTuple
;
namespace
paddle
{
namespace
inference
{
...
...
@@ -55,8 +49,8 @@ float random(float low, float high) {
return
dist
(
mt
);
}
void
RandomizeTensor
(
framework
::
LoDTensor
*
tensor
,
const
platform
::
Place
&
place
,
const
platform
::
DeviceContext
&
ctx
)
{
void
RandomizeTensor
(
framework
::
LoDTensor
*
tensor
,
const
platform
::
Place
&
place
)
{
auto
dims
=
tensor
->
dims
();
size_t
num_elements
=
analysis
::
AccuDims
(
dims
,
dims
.
size
());
PADDLE_ENFORCE_GT
(
num_elements
,
0
);
...
...
@@ -78,17 +72,19 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
* anakin
* layer.
*/
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionT
>
class
AnakinConvertValidation
{
using
AnakinNvEngineT
=
AnakinEngine
<
NV
,
Precision
::
FP32
>
;
using
AnakinNvEngineT
=
AnakinEngine
<
TargetT
,
PrecisionT
>
;
public:
AnakinConvertValidation
()
=
delete
;
AnakinConvertValidation
(
const
std
::
unordered_set
<
std
::
string
>&
parameters
,
framework
::
Scope
*
scope
)
:
parameters_
(
parameters
),
scope_
(
scope
),
place_
(
0
)
{
PADDLE_ENFORCE_EQ
(
cudaStreamCreate
(
&
stream_
),
0
);
engine_
.
reset
(
new
AnakinEngine
<
NV
,
Precision
::
FP32
>
(
true
));
framework
::
Scope
*
scope
,
const
platform
::
DeviceContext
&
ctx
,
bool
use_gpu
=
true
)
:
parameters_
(
parameters
),
scope_
(
scope
),
ctx_
(
ctx
),
use_gpu_
(
use_gpu
)
{
engine_
.
reset
(
new
AnakinEngine
<
TargetT
,
PrecisionT
>
(
true
));
}
// Declare a Variable as input with random initialization.
...
...
@@ -108,11 +104,10 @@ class AnakinConvertValidation {
}
void
DeclVar
(
const
std
::
string
&
name
,
const
std
::
vector
<
int
>
dim_vec
)
{
platform
::
CUDADeviceContext
ctx
(
place_
);
auto
*
x
=
scope_
->
Var
(
name
);
auto
*
x_tensor
=
x
->
GetMutable
<
framework
::
LoDTensor
>
();
x_tensor
->
Resize
(
framework
::
make_ddim
(
dim_vec
));
RandomizeTensor
(
x_tensor
,
place_
,
ctx
);
RandomizeTensor
(
x_tensor
,
ctx_
.
GetPlace
()
);
std
::
vector
<
int64_t
>
dim_vec_int64
;
for
(
auto
&
ele
:
dim_vec
)
{
...
...
@@ -132,7 +127,7 @@ class AnakinConvertValidation {
// should init anakin engine here.
auto
&
block_desc
=
program_desc_
.
Block
(
framework
::
kRootBlockIndex
);
Singleton
<
AnakinOpConverter
>::
Global
().
ConvertOp
(
Singleton
<
AnakinOpConverter
<
TargetT
,
PrecisionT
>
>::
Global
().
ConvertOp
(
desc
,
block_desc
,
parameters_
,
*
scope_
,
engine_
.
get
(),
true
/*test_mode*/
);
engine_
->
Freeze
();
...
...
@@ -151,7 +146,7 @@ class AnakinConvertValidation {
}
engine_
->
SetMaxInputShape
(
temp_max_input_shape
);
engine_
->
Optimize
();
engine_
->
Init
Graph
();
engine_
->
Init
Net
();
}
// We use the set 'neglected_output' here, because some Ops like batch norm,
...
...
@@ -160,11 +155,8 @@ class AnakinConvertValidation {
void
Execute
(
int
batch_size
,
std
::
unordered_set
<
std
::
string
>
neglected_output
=
{})
{
// Execute Fluid Op
platform
::
CUDADeviceContext
ctx
(
place_
);
op_
->
Run
(
*
scope_
,
place_
);
op_
->
Run
(
*
scope_
,
ctx_
.
GetPlace
());
// std::vector<framework::LoDTensor> input_vector;
// std::vector<framework::LoDTensor> output_vector;
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
inputs
;
for
(
const
auto
&
input
:
op_desc_
->
InputArgumentNames
())
{
if
(
parameters_
.
count
(
input
))
continue
;
...
...
@@ -180,20 +172,27 @@ class AnakinConvertValidation {
std
::
vector
<
float
>
fluid_out
;
auto
*
var
=
scope_
->
FindVar
(
output
);
auto
tensor
=
var
->
GetMutable
<
framework
::
LoDTensor
>
();
framework
::
TensorToVector
(
*
tensor
,
ctx
,
&
fluid_out
);
framework
::
TensorToVector
(
*
tensor
,
ctx
_
,
&
fluid_out
);
fluid_outputs
.
push_back
(
fluid_out
);
outputs
.
insert
({
output
,
tensor
});
}
engine_
->
Execute
(
inputs
,
outputs
,
stream_
);
if
(
!
use_gpu_
)
{
engine_
->
Execute
(
inputs
,
outputs
);
}
else
{
cudaStream_t
stream
;
PADDLE_ENFORCE_EQ
(
cudaStreamCreate
(
&
stream
),
0
);
engine_
->
Execute
(
inputs
,
outputs
,
stream
);
}
int
i_output
=
0
;
for
(
const
auto
&
output
:
op_desc_
->
OutputArgumentNames
())
{
if
(
neglected_output
.
count
(
output
))
continue
;
std
::
vector
<
float
>
anakin_out
;
auto
*
var
=
scope_
->
FindVar
(
output
);
auto
tensor
=
var
->
GetMutable
<
framework
::
LoDTensor
>
();
framework
::
TensorToVector
(
*
tensor
,
ctx
,
&
anakin_out
);
framework
::
TensorToVector
(
*
tensor
,
ctx
_
,
&
anakin_out
);
size_t
anakin_out_size
=
anakin_out
.
size
();
auto
fluid_out
=
fluid_outputs
[
i_output
++
];
...
...
@@ -205,15 +204,24 @@ class AnakinConvertValidation {
private:
std
::
unique_ptr
<
AnakinNvEngineT
>
engine_
{
nullptr
};
cudaStream_t
stream_
;
std
::
unique_ptr
<
framework
::
OperatorBase
>
op_
;
std
::
unique_ptr
<
framework
::
OpDesc
>
op_desc_
;
framework
::
ProgramDesc
program_desc_
;
const
std
::
unordered_set
<
std
::
string
>&
parameters_
;
framework
::
Scope
*
scope_
;
platform
::
CUDAPlace
place_
;
const
platform
::
DeviceContext
&
ctx_
;
bool
use_gpu_
{
true
};
};
template
class
AnakinConvertValidation
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>;
template
class
AnakinConvertValidation
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>;
template
class
AnakinConvertValidation
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>;
template
class
AnakinConvertValidation
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>;
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/anakin/engine.cc
浏览文件 @
9f1927bd
...
...
@@ -35,12 +35,14 @@ namespace anakin {
template
<
typename
TargetT
,
Precision
PrecisionType
,
OpRunType
RunType
>
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>::
AnakinEngine
(
bool
need_summary
,
int
device
,
int
max_batch_size
,
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
max_input_shape
)
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
max_input_shape
,
std
::
vector
<
std
::
string
>
program_inputs
)
:
graph_
(
new
AnakinGraphT
<
TargetT
,
PrecisionType
>
()),
net_
(
new
AnakinNetT
<
TargetT
,
PrecisionType
,
RunType
>
(
need_summary
))
{
device_
=
device
;
max_batch_size_
=
max_batch_size
;
max_input_shape_
=
max_input_shape
;
program_inputs_
=
program_inputs
;
}
template
<
typename
TargetT
,
Precision
PrecisionType
,
OpRunType
RunType
>
...
...
@@ -54,7 +56,7 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::SetInputShape(
}
template
<
typename
TargetT
,
Precision
PrecisionType
,
OpRunType
RunType
>
void
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>::
Init
Graph
()
{
void
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>::
Init
Net
()
{
net_
->
init
(
*
graph_
);
}
...
...
@@ -67,11 +69,11 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp(
}
template
<
typename
TargetT
,
Precision
PrecisionType
,
OpRunType
RunType
>
void
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>::
Execute
(
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
inputs
,
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
outputs
,
cudaStream_t
stream
)
{
void
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>::
BindInput
(
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
inputs
)
{
#ifdef PADDLE_WITH_CUDA
cudaDeviceSynchronize
();
#endif
for
(
const
auto
&
input
:
inputs
)
{
auto
*
tensor
=
input
.
second
;
auto
*
data
=
tensor
->
data
<
float
>
();
...
...
@@ -85,16 +87,53 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
int
max_shape_sum
=
std
::
accumulate
(
max_input_shape
.
begin
(),
max_input_shape
.
end
(),
1
,
std
::
multiplies
<
int
>
());
PADDLE_ENFORCE
(
max_shape_sum
>=
tensor
->
numel
(),
"The anakin input max shape should be greater than"
" or equal to the real input shape, Please set the max "
"input shape using EnableAnakinEngine"
);
if
(
tensor
->
numel
()
>
max_shape_sum
)
{
PADDLE_ENFORCE
(
std
::
find
(
program_inputs_
.
begin
(),
program_inputs_
.
end
(),
input
.
first
)
==
program_inputs_
.
end
(),
"The anakin input max shape should be greater than"
" or equal to the real input shape, Please set the max "
"input shape using EnableAnakinEngine"
);
VLOG
(
3
)
<<
"Anakin Net will be reset because of the inputs out of range: "
<<
input
.
first
;
graph_
->
Reshape
(
input
.
first
,
fluid_input_shape
);
net_
.
reset
(
new
AnakinNetT
<
TargetT
,
PrecisionType
,
RunType
>
(
true
));
net_
->
init
(
*
graph_
);
anakin_input
=
net_
->
get_in
(
input
.
first
);
}
anakin_input
->
reshape
(
fluid_input_shape
);
::
anakin
::
saber
::
Tensor
<
TargetT
>
tmp_anakin_tensor
(
data
,
TargetT
(),
0
,
fluid_input_shape
);
anakin_input
->
copy_from
(
tmp_anakin_tensor
);
}
}
template
<
typename
TargetT
,
Precision
PrecisionType
,
OpRunType
RunType
>
void
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>::
Execute
(
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
inputs
,
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
outputs
)
{
BindInput
(
inputs
);
net_
->
prediction
();
for
(
const
auto
&
output
:
outputs
)
{
platform
::
CPUPlace
cpu_place
;
auto
*
tensor
=
output
.
second
;
auto
*
anakin_output
=
net_
->
get_out
(
output
.
first
);
auto
*
anakin_data
=
anakin_output
->
data
();
auto
anakin_output_shape
=
anakin_output
->
valid_shape
();
tensor
->
Resize
(
framework
::
make_ddim
(
anakin_output_shape
));
auto
*
fluid_data
=
tensor
->
mutable_data
<
float
>
(
cpu_place
);
memory
::
Copy
(
cpu_place
,
static_cast
<
void
*>
(
fluid_data
),
cpu_place
,
static_cast
<
void
*>
(
anakin_data
),
tensor
->
numel
()
*
sizeof
(
float
));
}
}
#ifdef PADDLE_WITH_CUDA
template
<
typename
TargetT
,
Precision
PrecisionType
,
OpRunType
RunType
>
void
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>::
Execute
(
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
inputs
,
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
outputs
,
cudaStream_t
stream
)
{
BindInput
(
inputs
);
net_
->
prediction
();
cudaDeviceSynchronize
();
for
(
const
auto
&
output
:
outputs
)
{
...
...
@@ -111,10 +150,11 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
}
cudaDeviceSynchronize
();
}
#endif
template
<
typename
TargetT
,
Precision
PrecisionType
,
OpRunType
RunType
>
void
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>::
Freeze
()
{
PADDLE_ENFORCE
(
graph_
->
Freeze
_v3
(),
"Freeze anakin subgraph."
);
PADDLE_ENFORCE
(
graph_
->
Freeze
(),
"Freeze anakin subgraph."
);
}
template
<
typename
TargetT
,
Precision
PrecisionType
,
OpRunType
RunType
>
...
...
@@ -122,6 +162,12 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Optimize() {
PADDLE_ENFORCE
(
graph_
->
Optimize
(),
"Graph optimization."
);
}
template
<
typename
TargetT
,
Precision
PrecisionType
,
OpRunType
RunType
>
void
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>::
RegistBlock
(
::
anakin
::
PBlock
<
TargetT
>
*
block_p
)
{
PADDLE_ENFORCE
(
graph_
->
RegistBlock
(
block_p
),
"Block register."
);
}
template
<
typename
TargetT
,
Precision
PrecisionType
,
OpRunType
RunType
>
std
::
unique_ptr
<
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>>
AnakinEngine
<
TargetT
,
PrecisionType
,
RunType
>::
Clone
()
{
...
...
@@ -130,7 +176,24 @@ AnakinEngine<TargetT, PrecisionType, RunType>::Clone() {
return
std
::
unique_ptr
<
AnakinEngine
>
(
engine
);
}
#ifdef PADDLE_WITH_CUDA
template
class
AnakinEngine
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>;
template
class
AnakinEngineManager
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
FP32
>;
template
class
AnakinEngine
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>;
template
class
AnakinEngineManager
<::
anakin
::
saber
::
NV
,
::
anakin
::
Precision
::
INT8
>;
#endif
template
class
AnakinEngine
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>;
template
class
AnakinEngineManager
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
FP32
>;
template
class
AnakinEngine
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>;
template
class
AnakinEngineManager
<::
anakin
::
saber
::
X86
,
::
anakin
::
Precision
::
INT8
>;
// template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
}
// namespace anakin
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/anakin/engine.h
浏览文件 @
9f1927bd
...
...
@@ -32,7 +32,6 @@
#include "saber/saber_types.h"
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
namespace
anakin
{
...
...
@@ -58,9 +57,10 @@ class AnakinEngine {
public:
explicit
AnakinEngine
(
bool
need_summary
=
false
,
int
device
=
0
,
int
max_batch_size
=
1
,
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
max_input_shape
=
{});
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
max_input_shape
=
{},
std
::
vector
<
std
::
string
>
program_inputs
=
{});
~
AnakinEngine
();
void
Init
Graph
();
void
Init
Net
();
void
SetInputShape
(
const
std
::
string
&
name
,
std
::
vector
<
int
>
shape
);
void
AddOp
(
const
std
::
string
&
name
,
const
std
::
string
&
type
,
const
std
::
vector
<
std
::
string
>
&
inputs
,
...
...
@@ -81,20 +81,35 @@ class AnakinEngine {
void
SetMaxInputShape
(
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
shape
)
{
max_input_shape_
=
shape
;
}
const
std
::
vector
<
std
::
string
>
&
GetScalableInputs
()
{
return
program_inputs_
;
}
void
SetScalableInputs
(
std
::
vector
<
std
::
string
>
program_inputs
)
{
program_inputs_
=
program_inputs
;
}
int
GetMaxBatchSize
()
{
return
max_batch_size_
;
}
void
Freeze
();
void
Optimize
();
void
AllocTmpMem
()
{
PADDLE_ENFORCE
(
net_
->
alloc_memory_first
(
*
graph_
),
"anakin alloc temp memory first failed"
);
}
void
RegistBlock
(
::
anakin
::
PBlock
<
TargetT
>
*
block_p
);
void
Save
(
std
::
string
path
)
{
graph_
->
save
(
path
);
}
bool
IsInit
()
{
return
initialized_
;
}
int
GetDevice
()
{
return
device_
;
}
void
AddTensorScale
(
const
std
::
string
&
tensor_name
,
float
scale
)
{
tensor_scales_
[
tensor_name
]
=
scale
;
}
std
::
unordered_map
<
std
::
string
,
float
>
GetTensorScales
()
{
return
tensor_scales_
;
}
void
Execute
(
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
inputs
,
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
outputs
);
#ifdef PADDLE_WITH_CUDA
void
Execute
(
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
inputs
,
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
outputs
,
cudaStream_t
stream
);
#endif
private:
void
BindInput
(
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
inputs
);
private:
bool
initialized_
{
false
};
...
...
@@ -103,27 +118,30 @@ class AnakinEngine {
int
device_
;
std
::
unique_ptr
<
GraphT
>
graph_
;
std
::
unique_ptr
<
NetT
>
net_
;
std
::
vector
<
std
::
string
>
program_inputs_
;
std
::
unordered_map
<
std
::
string
,
float
>
tensor_scales_
;
};
template
<
typename
TargetT
,
::
anakin
::
Precision
PrecisionType
>
class
AnakinEngineManager
{
using
Anakin
NvEngineT
=
AnakinEngine
<
NV
,
Precision
::
FP32
>
;
using
Anakin
EngineT
=
AnakinEngine
<
TargetT
,
PrecisionType
>
;
public:
bool
HasEngine
(
const
std
::
string
&
name
)
const
{
if
(
engines_
.
count
(
name
)
==
0
)
return
false
;
return
engines_
.
at
(
name
).
get
()
!=
nullptr
;
}
Anakin
Nv
EngineT
*
Get
(
const
std
::
string
&
name
)
const
{
AnakinEngineT
*
Get
(
const
std
::
string
&
name
)
const
{
return
engines_
.
at
(
name
).
get
();
}
Anakin
NvEngineT
*
Create
(
bool
need_summary
,
int
device
,
int
max_batch_siz
e
,
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
max_input_shape
,
std
::
string
engine_name
)
{
Anakin
EngineT
*
Create
(
bool
need_summary
,
int
device
,
int
max_batch_size
,
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
max_input_shap
e
,
std
::
vector
<
std
::
string
>
program_inputs
,
std
::
string
engine_name
)
{
std
::
unique_lock
<
std
::
mutex
>
lk
(
mut_
);
auto
*
p
=
new
AnakinEngine
<
NV
,
Precision
::
FP32
>
(
need_summary
,
device
,
max_batch_size
,
max_input_shape
);
auto
*
p
=
new
AnakinEngine
<
TargetT
,
PrecisionType
>
(
need_summary
,
device
,
max_batch_size
,
max_input_shape
,
program_inputs
);
engines_
[
engine_name
].
reset
(
p
);
return
p
;
}
...
...
@@ -135,7 +153,7 @@ class AnakinEngineManager {
}
private:
std
::
unordered_map
<
std
::
string
,
std
::
unique_ptr
<
Anakin
Nv
EngineT
>>
engines_
;
std
::
unordered_map
<
std
::
string
,
std
::
unique_ptr
<
AnakinEngineT
>>
engines_
;
std
::
mutex
mut_
;
};
}
// namespace anakin
...
...
paddle/fluid/inference/anakin/op_teller.cc
浏览文件 @
9f1927bd
...
...
@@ -44,6 +44,8 @@ struct SimpleOpTypeSetTeller : public Teller {
teller_set
.
insert
(
"sum"
);
teller_set
.
insert
(
"depthwise_conv2d"
);
teller_set
.
insert
(
"prior_box"
);
teller_set
.
insert
(
"leaky_relu"
);
teller_set
.
insert
(
"affine_channel"
);
}
bool
operator
()(
const
std
::
string
&
op_type
,
...
...
paddle/fluid/inference/anakin/test_anakin_engine.cc
浏览文件 @
9f1927bd
...
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include "paddle/fluid/inference/anakin/engine.h"
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
...
...
@@ -52,11 +51,9 @@ TEST_F(TestAnakinEngine, Execute) {
engine_
->
AddOpAttr
(
"op1"
,
"axis"
,
1
);
std
::
vector
<
int
>
shape
=
{
1
,
1
,
1
,
2
};
Shape
tmp_shape
(
shape
);
// PBlock<NV> weight1(tmp_shape);
auto
*
weight1
=
GraphGlobalMem
<
NV
>::
Global
().
template
new_block
<
AK_FLOAT
>(
tmp_shape
);
// auto *weight1 = new PBlock<NV>(tmp_shape, AK_FLOAT);
PBlock
<
NV
>
*
weight1
=
new
PBlock
<
NV
>
(
tmp_shape
,
AK_FLOAT
);
engine_
->
RegistBlock
(
weight1
);
float
*
cpu_data
=
static_cast
<
float
*>
(
weight1
->
h_tensor
().
mutable_data
());
cpu_data
[
0
]
=
2.
;
weight1
->
d_tensor
().
set_shape
(
tmp_shape
);
...
...
@@ -68,7 +65,7 @@ TEST_F(TestAnakinEngine, Execute) {
// engine_->AddOpAttr("x", "input_shape", input_shape);
engine_
->
SetInputShape
(
"x"
,
{
1
,
1
,
1
,
1
});
engine_
->
Optimize
();
engine_
->
Init
Graph
();
engine_
->
Init
Net
();
framework
::
LoDTensor
x
;
framework
::
LoDTensor
y
;
x
.
Resize
({
1
,
1
,
1
,
1
});
...
...
paddle/fluid/inference/analysis/argument.h
浏览文件 @
9f1927bd
...
...
@@ -64,20 +64,20 @@ struct Argument {
bool
Has
(
const
std
::
string
&
key
)
const
{
return
valid_fields_
.
count
(
key
);
}
#define DECL_ARGUMENT_FIELD(field__, Field, type__) \
public: \
type__& field__() { \
PADDLE_ENFORCE(Has(#field__)
);
\
return field__##_; \
} \
void Set##Field(const type__& x) { \
field__##_ = x; \
valid_fields_.insert(#field__); \
} \
DECL_ARGUMENT_FIELD_VALID(field__); \
type__* field__##_ptr() { return &field__##_; } \
\
private: \
#define DECL_ARGUMENT_FIELD(field__, Field, type__)
\
public:
\
type__& field__() {
\
PADDLE_ENFORCE(Has(#field__)
, "There is no such field");
\
return field__##_;
\
}
\
void Set##Field(const type__& x) {
\
field__##_ = x;
\
valid_fields_.insert(#field__);
\
}
\
DECL_ARGUMENT_FIELD_VALID(field__);
\
type__* field__##_ptr() { return &field__##_; }
\
\
private:
\
type__ field__##_;
#define DECL_ARGUMENT_FIELD_VALID(field__) \
...
...
@@ -169,7 +169,13 @@ struct Argument {
anakin_max_shape_t
);
DECL_ARGUMENT_FIELD
(
anakin_max_batch_size
,
AnakinMaxBatchSize
,
int
);
DECL_ARGUMENT_FIELD
(
anakin_min_subgraph_size
,
AnakinMinSubgraphSize
,
int
);
DECL_ARGUMENT_FIELD
(
anakin_precision_mode
,
AnakinPrecisionMode
,
AnalysisConfig
::
Precision
);
DECL_ARGUMENT_FIELD
(
use_anakin
,
UseAnakin
,
bool
);
DECL_ARGUMENT_FIELD
(
anakin_passes_filter
,
AnakinPassesFilter
,
std
::
vector
<
std
::
string
>
);
DECL_ARGUMENT_FIELD
(
anakin_ops_filter
,
AnakinOpsFilter
,
std
::
vector
<
std
::
string
>
);
// Memory optimized related.
DECL_ARGUMENT_FIELD
(
enable_memory_optim
,
EnableMemoryOptim
,
bool
);
...
...
paddle/fluid/inference/analysis/ir_pass_manager.cc
浏览文件 @
9f1927bd
...
...
@@ -114,6 +114,7 @@ void IRPassManager::CreatePasses(Argument *argument,
if
(
pass_name
==
"anakin_subgraph_pass"
)
{
pass
->
Set
(
"program"
,
new
framework
::
ProgramDesc
*
(
&
argument
->
main_program
()));
pass
->
Set
(
"use_gpu"
,
new
bool
(
argument
->
use_gpu
()));
pass
->
Set
(
"gpu_device_id"
,
new
int
(
argument
->
gpu_device_id
()));
pass
->
Set
(
"model_from_memory"
,
new
bool
(
argument
->
model_from_memory
()));
pass
->
Set
(
"engine_opt_info"
,
new
std
::
map
<
std
::
string
,
std
::
string
>
(
...
...
@@ -122,6 +123,11 @@ void IRPassManager::CreatePasses(Argument *argument,
pass
->
Set
(
"max_input_shape"
,
new
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
(
argument
->
anakin_max_input_shape
()));
pass
->
Set
(
"max_batch_size"
,
new
int
(
argument
->
anakin_max_batch_size
()));
bool
enable_int8
=
argument
->
anakin_precision_mode
()
==
AnalysisConfig
::
Precision
::
kInt8
;
pass
->
Set
(
"enable_int8"
,
new
bool
(
enable_int8
));
pass
->
Set
(
"anakin_ops_filter"
,
new
std
::
vector
<
std
::
string
>
(
argument
->
anakin_ops_filter
()));
}
pre_pass
=
pass_name
;
...
...
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
浏览文件 @
9f1927bd
...
...
@@ -39,8 +39,14 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
framework
::
ir
::
Graph
*
graph
)
const
{
framework
::
ir
::
FusePassBase
::
Init
(
"anakin_subgraph_pass"
,
graph
);
auto
teller
=
[](
const
framework
::
ir
::
Node
*
node
)
{
if
(
!
node
->
IsOp
()
||
!
node
->
Op
())
return
false
;
auto
&
anakin_ops_filter
=
Get
<
std
::
vector
<
std
::
string
>>
(
"anakin_ops_filter"
);
auto
teller
=
[
&
anakin_ops_filter
](
const
framework
::
ir
::
Node
*
node
)
{
if
(
!
node
->
IsOp
()
||
!
node
->
Op
())
return
false
;
else
if
(
std
::
find
(
anakin_ops_filter
.
begin
(),
anakin_ops_filter
.
end
(),
node
->
Op
()
->
Type
())
!=
anakin_ops_filter
.
end
())
return
false
;
return
anakin
::
OpTeller
::
Global
().
Tell
(
node
->
Op
()
->
Type
(),
*
node
->
Op
());
};
...
...
@@ -191,22 +197,76 @@ void AnakinSubgraphPass::CreateAnakinOp(
SetAttr
(
op_desc
->
Proto
(),
"engine_key"
,
engine_key
);
auto
max_input_shape
=
Get
<
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>>
(
"max_input_shape"
);
auto
max_batch_size
=
Get
<
int
>
(
"max_batch_size"
);
auto
program_inputs
=
program_desc
->
GetFeedTargetNames
(
);
auto
*
anakin_engine
=
inference
::
Singleton
<
anakin
::
AnakinEngineManager
>::
Global
().
Create
(
true
,
Get
<
int
>
(
"gpu_device_id"
),
max_batch_size
,
max_input_shape
,
engine_key
);
bool
use_gpu
=
Get
<
bool
>
(
"use_gpu"
);
SetAttr
(
op_desc
->
Proto
(),
"use_gpu"
,
use_gpu
);
bool
enable_int8
=
Get
<
bool
>
(
"enable_int8"
);
SetAttr
(
op_desc
->
Proto
(),
"enable_int8"
,
enable_int8
);
if
(
enable_int8
)
{
CreateAnakinEngine
<::
anakin
::
Precision
::
INT8
>
(
&
block_desc
,
params
,
input_names
,
output_mapping
,
program_inputs
,
engine_key
);
}
else
{
CreateAnakinEngine
<::
anakin
::
Precision
::
FP32
>
(
&
block_desc
,
params
,
input_names
,
output_mapping
,
program_inputs
,
engine_key
);
}
}
template
<::
anakin
::
Precision
PrecisionT
>
void
AnakinSubgraphPass
::
CreateAnakinEngine
(
framework
::
BlockDesc
*
block_desc
,
const
std
::
vector
<
std
::
string
>
&
params
,
const
std
::
set
<
std
::
string
>
&
input_names
,
const
std
::
vector
<
std
::
string
>
&
output_mapping
,
const
std
::
vector
<
std
::
string
>
&
program_inputs
,
const
std
::
string
&
engine_key
)
const
{
framework
::
BlockDesc
block_desc_temp
(
nullptr
,
block_desc
->
Proto
());
bool
use_gpu
=
Get
<
bool
>
(
"use_gpu"
);
auto
max_batch_size
=
Get
<
int
>
(
"max_batch_size"
);
auto
max_input_shape
=
Get
<
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>>
(
"max_input_shape"
);
if
(
use_gpu
)
{
#ifdef PADDLE_WITH_CUDA
inference
::
Singleton
<
anakin
::
AnakinEngineManager
<::
anakin
::
saber
::
NV
,
PrecisionT
>>::
Global
()
.
Create
(
true
,
Get
<
int
>
(
"gpu_device_id"
),
max_batch_size
,
max_input_shape
,
program_inputs
,
engine_key
);
#endif
}
else
{
inference
::
Singleton
<
anakin
::
AnakinEngineManager
<::
anakin
::
saber
::
X86
,
PrecisionT
>>::
Global
()
.
Create
(
true
,
Get
<
int
>
(
"gpu_device_id"
),
max_batch_size
,
max_input_shape
,
program_inputs
,
engine_key
);
}
auto
*
scope
=
param_scope
();
std
::
unordered_set
<
std
::
string
>
param_set
(
params
.
begin
(),
params
.
end
());
framework
::
BlockDesc
block_desc_temp
(
nullptr
,
block_desc
.
Proto
());
inference
::
Singleton
<
inference
::
anakin
::
AnakinOpConverter
>::
Global
()
.
ConvertBlockToAnakinEngine
(
&
block_desc_temp
,
scope
,
std
::
vector
<
std
::
string
>
(
input_names
.
begin
(),
input_names
.
end
()),
param_set
,
output_mapping
,
anakin_engine
);
if
(
use_gpu
)
{
#ifdef PADDLE_WITH_CUDA
auto
*
anakin_engine
=
inference
::
Singleton
<
inference
::
anakin
::
AnakinEngineManager
<
::
anakin
::
saber
::
NV
,
PrecisionT
>>::
Global
()
.
Get
(
engine_key
);
inference
::
Singleton
<
inference
::
anakin
::
AnakinOpConverter
<
::
anakin
::
saber
::
NV
,
PrecisionT
>>::
Global
()
.
ConvertBlockToAnakinEngine
(
&
block_desc_temp
,
scope
,
std
::
vector
<
std
::
string
>
(
input_names
.
begin
(),
input_names
.
end
()),
param_set
,
output_mapping
,
anakin_engine
);
#endif
}
else
{
auto
*
anakin_engine
=
inference
::
Singleton
<
inference
::
anakin
::
AnakinEngineManager
<
::
anakin
::
saber
::
X86
,
PrecisionT
>>::
Global
()
.
Get
(
engine_key
);
inference
::
Singleton
<
inference
::
anakin
::
AnakinOpConverter
<
::
anakin
::
saber
::
X86
,
PrecisionT
>>::
Global
()
.
ConvertBlockToAnakinEngine
(
&
block_desc_temp
,
scope
,
std
::
vector
<
std
::
string
>
(
input_names
.
begin
(),
input_names
.
end
()),
param_set
,
output_mapping
,
anakin_engine
);
}
}
}
// namespace analysis
...
...
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
浏览文件 @
9f1927bd
...
...
@@ -15,6 +15,7 @@
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
...
...
@@ -36,6 +37,13 @@ class AnakinSubgraphPass : public framework::ir::FusePassBase {
const
std
::
vector
<
std
::
string
>
&
graph_params
,
std
::
vector
<
std
::
string
>
*
repetitive_params
)
const
;
void
CleanIntermediateOutputs
(
framework
::
ir
::
Node
*
node
);
template
<::
anakin
::
Precision
PrecisionT
>
void
CreateAnakinEngine
(
framework
::
BlockDesc
*
block_desc
,
const
std
::
vector
<
std
::
string
>
&
params
,
const
std
::
set
<
std
::
string
>
&
input_names
,
const
std
::
vector
<
std
::
string
>
&
output_mapping
,
const
std
::
vector
<
std
::
string
>
&
program_inputs
,
const
std
::
string
&
engine_key
)
const
;
};
}
// namespace analysis
...
...
paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
浏览文件 @
9f1927bd
...
...
@@ -100,7 +100,6 @@ void RenameAndGetOutputs(
const
std
::
string
arg_value
=
in_var
->
arguments
(
k
);
const
std
::
string
arg_value_with_id
=
arg_value
+
std
::
to_string
(
var2id
[
arg_value
]);
if
(
input_names_with_id
.
count
(
arg_value_with_id
))
{
replaced_names
.
push_back
(
arg_value
);
if
(
graph_var_map
.
count
(
arg_value
))
{
...
...
@@ -149,7 +148,6 @@ void RenameAndGetOutputs(
const
std
::
string
arg_value
=
out_var
->
arguments
(
k
);
const
std
::
string
arg_value_with_id
=
arg_value
+
std
::
to_string
(
var2id
[
arg_value
]);
if
(
graph_var_map
.
count
(
arg_value
))
{
add_block_var
(
arg_value
,
arg_value_with_id
);
}
...
...
paddle/fluid/inference/api/analysis_config.cc
浏览文件 @
9f1927bd
...
...
@@ -116,6 +116,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER
(
anakin_max_batchsize_
);
CP_MEMBER
(
anakin_max_input_shape_
);
CP_MEMBER
(
anakin_min_subgraph_size_
);
CP_MEMBER
(
anakin_precision_mode_
);
CP_MEMBER
(
anakin_passes_filter_
);
CP_MEMBER
(
anakin_ops_filter_
);
// Ir related.
CP_MEMBER
(
enable_ir_optim_
);
...
...
@@ -268,13 +271,18 @@ void AnalysisConfig::Update() {
PADDLE_ENFORCE
(
!
use_tensorrt_
,
"Anakin sub-graph and TensorRT sub-graph are not allowed to "
"run at the same time!"
);
PADDLE_ENFORCE
(
use_gpu_
,
"Anakin sub-graph engine need gpu, please use the EnableGpu API."
);
if
(
use_gpu_
)
{
LOG
(
INFO
)
<<
"Run Anakin GPU mode"
;
}
else
{
LOG
(
INFO
)
<<
"Run Anakin CPU mode"
;
}
pass_builder
()
->
ClearPasses
();
for
(
const
auto
&
pass
:
kAnakinSubgraphPasses
)
{
pass_builder
()
->
AppendPass
(
pass
);
if
(
std
::
find
(
anakin_passes_filter_
.
begin
(),
anakin_passes_filter_
.
end
(),
pass
)
==
anakin_passes_filter_
.
end
())
{
pass_builder
()
->
AppendPass
(
pass
);
}
}
}
...
...
@@ -389,11 +397,16 @@ void AnalysisConfig::SwitchIrDebug(int x) {
}
void
AnalysisConfig
::
EnableAnakinEngine
(
int
max_batch_size
,
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
max_input_shape
,
int
min_subgraph_size
)
{
int
min_subgraph_size
,
AnalysisConfig
::
Precision
precision_mode
,
std
::
vector
<
std
::
string
>
passes_filter
,
std
::
vector
<
std
::
string
>
ops_filter
)
{
anakin_max_batchsize_
=
max_batch_size
;
anakin_max_input_shape_
=
max_input_shape
;
anakin_min_subgraph_size_
=
min_subgraph_size
;
anakin_passes_filter_
=
passes_filter
;
anakin_ops_filter_
=
ops_filter
;
use_anakin_
=
true
;
anakin_precision_mode_
=
precision_mode
;
Update
();
}
}
// namespace paddle
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
9f1927bd
...
...
@@ -382,10 +382,13 @@ void AnalysisPredictor::PrepareArgument() {
argument_
.
SetTensorRtUseStaticEngine
(
config_
.
trt_use_static_engine_
);
}
if
(
config_
.
use_gpu
()
&&
config_
.
anakin_engine_enabled
())
{
if
(
config_
.
anakin_engine_enabled
())
{
argument_
.
SetAnakinMaxBatchSize
(
config_
.
anakin_max_batchsize_
);
argument_
.
SetAnakinMaxInputShape
(
config_
.
anakin_max_input_shape_
);
argument_
.
SetAnakinMinSubgraphSize
(
config_
.
anakin_min_subgraph_size_
);
argument_
.
SetAnakinPrecisionMode
(
config_
.
anakin_precision_mode_
);
argument_
.
SetAnakinPassesFilter
(
config_
.
anakin_passes_filter_
);
argument_
.
SetAnakinOpsFilter
(
config_
.
anakin_ops_filter_
);
LOG
(
INFO
)
<<
"Anakin subgraph engine is enabled"
;
}
...
...
@@ -888,4 +891,6 @@ USE_ANAKIN_CONVERTER(density_prior_box);
USE_ANAKIN_CONVERTER
(
dropout
);
USE_ANAKIN_CONVERTER
(
sum
);
USE_ANAKIN_CONVERTER
(
prior_box
);
USE_ANAKIN_CONVERTER
(
leaky_relu
);
USE_ANAKIN_CONVERTER
(
affine_channel
);
#endif
paddle/fluid/inference/api/paddle_analysis_config.h
浏览文件 @
9f1927bd
...
...
@@ -152,7 +152,9 @@ struct AnalysisConfig {
void
EnableAnakinEngine
(
int
max_batch_size
=
1
,
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
max_input_shape
=
{},
int
min_subgraph_size
=
6
);
int
min_subgraph_size
=
6
,
Precision
precision
=
Precision
::
kFloat32
,
std
::
vector
<
std
::
string
>
passes_filter
=
{},
std
::
vector
<
std
::
string
>
ops_filter
=
{});
/** A boolean state indicating whether the Anakin sub-graph engine is used.
*/
...
...
@@ -291,6 +293,9 @@ struct AnalysisConfig {
int
anakin_max_batchsize_
;
int
anakin_min_subgraph_size_
{
6
};
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
anakin_max_input_shape_
;
Precision
anakin_precision_mode_
;
std
::
vector
<
std
::
string
>
anakin_passes_filter_
;
std
::
vector
<
std
::
string
>
anakin_ops_filter_
;
std
::
map
<
std
::
string
,
std
::
string
>
engine_opt_info_
;
bool
use_mkldnn_quantizer_
{
false
};
...
...
paddle/fluid/inference/api/paddle_pass_builder.cc
浏览文件 @
9f1927bd
...
...
@@ -73,15 +73,16 @@ void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
// The following passes works for Anakin sub-graph engine.
const
std
::
vector
<
std
::
string
>
kAnakinSubgraphPasses
({
"infer_clean_graph_pass"
,
//
"quant_conv2d_dequant_fuse_pass"
,
//
"simplify_anakin_priorbox_detection_out_pass"
,
//
"fillconstant_elementwisemul_fuse"
,
//
"fc_fuse_pass"
,
//
"conv_elementwise_add_fuse_pass"
,
//
"conv_bn_fuse_pass"
,
//
"conv_elementwise_add_fuse_pass"
,
//
"fc_gru_fuse_pass"
,
//
"
quant_conv2d_dequant_fuse_pass"
,
//
"
anakin_subgraph_pass"
,
//
"conv_bn_fuse_pass", //
//
"conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass"
,
//
"
anakin_subgraph_pass"
,
//
"
fc_gru_fuse_pass"
,
//
});
GpuPassStrategy
::
GpuPassStrategy
()
:
PassStrategy
({})
{
...
...
paddle/fluid/operators/anakin/anakin_engine_op.h
浏览文件 @
9f1927bd
...
...
@@ -34,28 +34,17 @@ limitations under the License. */
namespace
paddle
{
namespace
operators
{
using
FluidDT
=
framework
::
proto
::
VarType_Type
;
using
inference
::
Singleton
;
using
anakin
::
graph
::
GraphGlobalMem
;
using
anakin
::
AK_FLOAT
;
using
anakin
::
Precision
;
using
anakin
::
saber
::
NV
;
using
anakin
::
saber
::
X86
;
using
anakin
::
saber
::
Shape
;
using
anakin
::
PBlock
;
using
anakin
::
PTuple
;
using
inference
::
anakin
::
AnakinEngine
;
class
AnakinEngineOp
:
public
framework
::
OperatorBase
{
using
AnakinNvEngineT
=
AnakinEngine
<
NV
,
Precision
::
FP32
>
;
private:
std
::
vector
<
std
::
string
>
input_names_
;
std
::
unordered_set
<
std
::
string
>
param_names_
;
mutable
AnakinNvEngineT
*
anakin_engine_
;
std
::
string
engine_key_
;
std
::
string
engine_serialized_data_
;
bool
use_gpu_
;
bool
enable_int8_
;
public:
AnakinEngineOp
(
const
std
::
string
&
type
,
...
...
@@ -66,10 +55,11 @@ class AnakinEngineOp : public framework::OperatorBase {
input_names_
=
Inputs
(
"Xs"
);
engine_key_
=
Attr
<
std
::
string
>
(
"engine_key"
);
auto
params
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"parameters"
);
use_gpu_
=
Attr
<
bool
>
(
"use_gpu"
);
enable_int8_
=
Attr
<
bool
>
(
"enable_int8"
);
for
(
const
auto
&
param
:
params
)
{
param_names_
.
insert
(
param
);
}
anakin_engine_
=
nullptr
;
}
protected:
...
...
@@ -80,19 +70,12 @@ class AnakinEngineOp : public framework::OperatorBase {
void
RunAnakin
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
{
auto
*
engine
=
GetEngine
(
scope
,
dev_place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
dev_ctx
).
stream
();
PADDLE_ENFORCE
(
!
input_names_
.
empty
(),
"should pass more than one inputs"
);
std
::
vector
<
std
::
string
>
output_maps
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"output_name_mapping"
);
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
inputs
;
// Convert input tensor from fluid to engine.
for
(
const
auto
&
x
:
Inputs
(
"Xs"
))
{
if
(
param_names_
.
count
(
x
))
continue
;
auto
&
t
=
...
...
@@ -110,17 +93,38 @@ class AnakinEngineOp : public framework::OperatorBase {
outputs
.
insert
({
output_maps
[
output_index
],
fluid_t
});
output_index
+=
1
;
}
engine
->
Execute
(
inputs
,
outputs
,
stream
);
if
(
enable_int8_
)
{
Execute
<::
anakin
::
Precision
::
INT8
>
(
inputs
,
outputs
,
dev_place
);
}
else
{
Execute
<::
anakin
::
Precision
::
FP32
>
(
inputs
,
outputs
,
dev_place
);
}
}
AnakinNvEngineT
*
GetEngine
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
{
if
(
anakin_engine_
==
nullptr
)
{
anakin_engine_
=
inference
::
Singleton
<
inference
::
anakin
::
AnakinEngineManager
>::
Global
()
template
<::
anakin
::
Precision
PrecisionT
>
void
Execute
(
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
inputs
,
const
std
::
map
<
std
::
string
,
framework
::
LoDTensor
*>
&
outputs
,
const
platform
::
Place
&
dev_place
)
const
{
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
dev_ctx
)
.
stream
();
auto
*
engine
=
inference
::
Singleton
<
inference
::
anakin
::
AnakinEngineManager
<
::
anakin
::
saber
::
NV
,
PrecisionT
>>::
Global
()
.
Get
(
engine_key_
);
engine
->
Execute
(
inputs
,
outputs
,
stream
);
#endif
}
else
{
auto
*
engine
=
inference
::
Singleton
<
inference
::
anakin
::
AnakinEngineManager
<
::
anakin
::
saber
::
X86
,
PrecisionT
>>::
Global
()
.
Get
(
engine_key_
);
engine
->
Execute
(
inputs
,
outputs
);
}
return
anakin_engine_
;
}
};
...
...
paddle/fluid/pybind/inference_api.cc
浏览文件 @
9f1927bd
...
...
@@ -16,6 +16,7 @@
#include <pybind11/stl.h>
#include <cstring>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/analysis_predictor.h"
...
...
@@ -229,6 +230,14 @@ void BindAnalysisConfig(py::module *m) {
py
::
arg
(
"min_subgraph_size"
)
=
3
,
py
::
arg
(
"precision_mode"
)
=
AnalysisConfig
::
Precision
::
kFloat32
,
py
::
arg
(
"use_static"
)
=
true
)
.
def
(
"enable_anakin_engine"
,
&
AnalysisConfig
::
EnableAnakinEngine
,
py
::
arg
(
"max_batch_size"
)
=
1
,
py
::
arg
(
"max_input_shape"
)
=
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
(),
py
::
arg
(
"min_subgraph_size"
)
=
6
,
py
::
arg
(
"precision_mode"
)
=
AnalysisConfig
::
Precision
::
kFloat32
,
py
::
arg
(
"passes_filter"
)
=
std
::
vector
<
std
::
string
>
(),
py
::
arg
(
"ops_filter"
)
=
std
::
vector
<
std
::
string
>
())
.
def
(
"tensorrt_engine_enabled"
,
&
AnalysisConfig
::
tensorrt_engine_enabled
)
.
def
(
"switch_ir_debug"
,
&
AnalysisConfig
::
SwitchIrDebug
,
py
::
arg
(
"x"
)
=
true
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录