Commit 69d37f81
Authored Mar 20, 2019 by nhzlx

cherry-pick from feature/anakin-engine: refine anakin subgraph. #16157
support change input size

Parent: a1d200a5
Showing 27 changed files with 390 additions and 80 deletions (+390 -80)
paddle/fluid/framework/ir/graph_pattern_detector.cc                   +14  -2
paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc   +10  -3
paddle/fluid/inference/anakin/CMakeLists.txt                           +1  -1
paddle/fluid/inference/anakin/convert/batch_norm.cc                    +2  -0
paddle/fluid/inference/anakin/convert/density_prior_box.cc            +21 -10
paddle/fluid/inference/anakin/convert/op_converter.h                  +30  -0
paddle/fluid/inference/anakin/convert/pool2d.cc                        +1  -1
paddle/fluid/inference/anakin/convert/softmax.cc                       +1  -1
paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc            +2  -1
paddle/fluid/inference/anakin/convert/test_pool2d_op.cc               +41  -0
paddle/fluid/inference/anakin/convert/ut_helper.h                      +1  -1
paddle/fluid/inference/anakin/engine.cc                               +31 -17
paddle/fluid/inference/anakin/engine.h                                +43  -2
paddle/fluid/inference/anakin/test_anakin_engine.cc                    +3  -4
paddle/fluid/inference/analysis/argument.h                             +6  -0
paddle/fluid/inference/analysis/ir_pass_manager.cc                     +9  -0
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc     +45 -10
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h       +8  -2
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc   +15  -3
paddle/fluid/inference/api/CMakeLists.txt                              +0  -5
paddle/fluid/inference/api/analysis_config.cc                         +25  -2
paddle/fluid/inference/api/analysis_predictor.cc                       +8  -1
paddle/fluid/inference/api/analysis_predictor.h                        +4  -1
paddle/fluid/inference/api/paddle_analysis_config.h                   +13  -0
paddle/fluid/inference/api/paddle_pass_builder.cc                     +12  -0
paddle/fluid/inference/api/paddle_pass_builder.h                       +3  -0
paddle/fluid/operators/anakin/anakin_engine_op.h                      +41 -13
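
Taken together, these changes wire a new Anakin sub-graph mode into the inference API. A minimal, hedged usage sketch of the new user-facing switch (not code from this commit; the model directory and GPU memory budget are hypothetical, the config calls follow the API on this branch):

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet_v2");    // hypothetical model directory
  config.EnableUseGpu(100 /*MB*/, 0);   // Anakin sub-graph engine requires GPU
  config.EnableAnakinEngine();          // new API added by this commit
  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}
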
paddle/fluid/framework/ir/graph_pattern_detector.cc

@@ -1527,6 +1527,16 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
           ->assert_is_op_output("box_coder")
           ->AsIntermediate();
+  auto transpose_before_nms =
+      pattern->NewNode(GetNodeName("transpose_before_nms"))
+          ->assert_is_op("transpose2");
+  auto transpose_before_nms_out =
+      pattern->NewNode(GetNodeName("transpose_before_nms_out"))
+          ->assert_is_op_output("transpose2")
+          ->assert_is_op_input("multiclass_nms", "Scores")
+          ->AsIntermediate();
   auto multiclass_nms_op =
       pattern->NewNode(GetNodeName("multiclass_nms"))
           ->assert_is_op("multiclass_nms")
           ->assert_op_has_n_inputs("multiclass_nms", 2);

@@ -1577,8 +1587,10 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
       {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
   box_coder_out->LinksFrom({box_coder_op});
-  multiclass_nms_op
-      ->LinksFrom({box_coder_out, conv_in[kMultiClassSecondInputNmsOffset]})
+  transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]});
+  transpose_before_nms_out->LinksFrom({transpose_before_nms});
+  multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out})
       .LinksTo({multiclass_nms_out});

   return multiclass_nms_out;

paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc

@@ -45,7 +45,7 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
   input_nodes.push_back(gpd.mutable_pattern()
                             ->NewNode("x" + std::to_string(times + 1))
-                            ->assert_is_op_input("multiclass_nms", "Scores")
+                            ->assert_is_op_input("transpose2")
                             ->AsInput());

   patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);

@@ -106,6 +106,11 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
   Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out"));
   Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]);
+  Node *transpose_before_nms =
+      subgraph.at(pattern.GetPDNode("transpose_before_nms"));
+  Node *transpose_before_nms_out =
+      subgraph.at(pattern.GetPDNode("transpose_before_nms_out"));
   Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms"));
   Node *multiclass_nms_out = subgraph.at(pattern.GetPDNode("multiclass_nms_out"));

@@ -133,11 +138,11 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
         nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
   }
-  int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
+  // int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
   framework::OpDesc concat1_desc;
   concat1_desc.SetType("concat");
   concat1_desc.SetInput("X", concat1_input_names);
-  concat1_desc.SetAttr("axis", axis);
+  concat1_desc.SetAttr("axis", 2);
   concat1_desc.SetOutput("Out", {concat_out1->Name()});

   auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc);

@@ -184,6 +189,8 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
   delete_nodes.insert(concat_out2);
   delete_nodes.insert(box_coder_op);
   delete_nodes.insert(box_coder_out);
+  delete_nodes.insert(transpose_before_nms);
+  delete_nodes.insert(transpose_before_nms_out);
   delete_nodes.insert(multiclass_nms);

   new_add_concat_op->outputs.push_back(concat_out1);

paddle/fluid/inference/anakin/CMakeLists.txt

 cc_library(anakin_engine SRCS engine.cc)
-nv_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
+cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
 target_link_libraries(anakin_engine anakin anakin_saber_common)
 cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
 add_subdirectory(convert)

paddle/fluid/inference/anakin/convert/batch_norm.cc

@@ -43,11 +43,13 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Y").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
   auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
   // auto momentum = boost::get<float>(op_desc.GetAttr("momentum"));

   auto bn_op_name = op_name + ":bn";
   auto bn_output = bn_op_name + "_output";
   engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
   engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
+  engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));

   auto scale_op_name = op_name + ":scale";
   auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,

paddle/fluid/inference/anakin/convert/density_prior_box.cc

@@ -27,8 +27,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

 void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
                                             const framework::Scope &scope,
                                             bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto input_name = op_desc.Input("Input").front();

@@ -42,34 +42,45 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto fixed_ratios =
       boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
   auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
+  std::vector<float> dens;
+  for (auto &ele : densities) {
+    dens.push_back(static_cast<float>(ele));
+  }

   // lack flip
-  auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
+  // auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
   auto variances =
       boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
+  for (auto &ele : variances) {
+    LOG(INFO) << ele;
+  }

   // lack img_h, img_w
   auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
   auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
   auto offset = boost::get<float>(op_desc.GetAttr("offset"));
-  std::vector<std::string> order = {"MIN", "COM", "MAX"};
+  PTuple<std::string> t_order;
+  t_order.push_back("MIN");
+  t_order.push_back("COM");
+  t_order.push_back("MAX");

   std::vector<float> temp_v = {};

   engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
   engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
   engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
   engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_sizes", fixed_sizes);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratios", fixed_ratios);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "density", densities);
-  engine_->AddOpAttr(op_name, "is_flip", false);
-  engine_->AddOpAttr(op_name, "is_clip", clip);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
+  engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false));
+  engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false));
   engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
   engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "step_h", step_h);
   engine_->AddOpAttr(op_name, "step_w", step_w);
   engine_->AddOpAttr(op_name, "offset", offset);
-  engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", order);
+  engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", t_order);
 }
 }  // namespace anakin

paddle/fluid/inference/anakin/convert/op_converter.h

@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "framework/core/types.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"

@@ -68,6 +69,35 @@ class AnakinOpConverter {
       ConvertOp(op, parameters, scope, engine);
     }
   }

+  // The scope here should be inited with the parameter vars.
+  void ConvertBlockToAnakinEngine(
+      framework::BlockDesc *block_desc, const framework::Scope &scope,
+      const std::vector<std::string> &inputs,
+      const std::unordered_set<std::string> &parameters,
+      const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
+    framework::proto::BlockDesc *block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    engine->Freeze();
+    for (auto &input : inputs) {
+      if (parameters.count(input)) continue;
+      auto *var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+      auto var_shape = var->GetShape();
+      PADDLE_ENFORCE(var_shape.size() == 4);
+      std::vector<int> input_shape;
+      for (int i = 0; i < var_shape.size(); i++) {
+        input_shape.push_back(var_shape[i]);
+      }
+      input_shape[0] = 1;
+      engine->SetInputShape(input, input_shape);
+    }
+    engine->Optimize();
+    engine->InitGraph();
+  }
+
   void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
   virtual ~AnakinOpConverter() {}

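The new ConvertBlockToAnakinEngine entry point converts a block, freezes the graph, pins every non-parameter input to batch size 1, and initializes the net in one call. For context, the anakin_subgraph_pass.cc change later in this commit drives it roughly like this (condensed from that call site; block_desc, scope, input_names, param_set, output_mapping, and anakin_engine all come from the pass):

framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
    .ConvertBlockToAnakinEngine(
        &block_desc_temp, *scope,
        std::vector<std::string>(input_names.begin(), input_names.end()),
        param_set, output_mapping, anakin_engine);
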
paddle/fluid/inference/anakin/convert/pool2d.cc

@@ -55,7 +55,7 @@ void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
   if (pool_type == "max") {
     anakin_pool_type = "MAX";
   } else if (pool_type == "avg") {
-    anakin_pool_type = "AVG";
+    anakin_pool_type = "AVGEXC";
   } else {
     PADDLE_THROW("TensorRT unsupported pooling type!");
   }

paddle/fluid/inference/anakin/convert/softmax.cc

@@ -33,7 +33,7 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();

   engine_->AddOp(op_name, "Softmax", {input}, {output});
-  engine_->AddOpAttr(op_name, "axis", 1);
+  engine_->AddOpAttr(op_name, "axis", 2);
 }
 }  // namespace anakin

paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc

@@ -52,8 +52,9 @@ TEST(batch_norm_op, test) {
   desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});

   float eps = 1e-5f;
+  bool is_test = true;
   desc.SetAttr("epsilon", eps);
-  desc.SetAttr("is_test", true);
+  desc.SetAttr("is_test", is_test);

   validator.SetOp(*desc.Proto());

paddle/fluid/inference/anakin/convert/test_pool2d_op.cc

@@ -64,11 +64,52 @@ void test_pool2d(bool global_pooling, bool ceil_mode,
   validator.Execute(1);
 }

+void test_pool2d2(bool global_pooling, bool ceil_mode,
+                  std::string pool_type = "max") {
+  auto *pool2d_converter =
+      Registry<AnakinOpConverter>::Global().Lookup("pool2d");
+  ASSERT_TRUE(pool2d_converter);
+
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  AnakinConvertValidation validator(parameters, scope);
+
+  // The ITensor's Dims should not contain the batch size.
+  // So, the ITensor's Dims of input and output should be C * H * W.
+  validator.DeclInputVar("pool2d_x", {1, 1, 17, 17});
+  validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17});
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("pool2d");
+  desc.SetInput("X", {"pool2d_x"});
+  desc.SetOutput("Out", {"pool2d_out"});
+
+  std::vector<int> ksize({3, 3});
+  std::vector<int> strides({1, 1});
+  std::vector<int> paddings({1, 1});
+  std::string pooling_t = pool_type;
+
+  desc.SetAttr("pooling_type", pooling_t);
+  desc.SetAttr("ksize", ksize);
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+  desc.SetAttr("global_pooling", global_pooling);
+  desc.SetAttr("ceil_mode", true);
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(1);
+}
+
 TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
 TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }

 TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); }
 TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
+TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); }

 }  // namespace anakin
 }  // namespace inference

paddle/fluid/inference/anakin/convert/ut_helper.h

@@ -168,7 +168,7 @@ class AnakinConvertValidation {
       outputs.insert({output, tensor});
     }

-    engine_->Execute(inputs, outputs);
+    engine_->Execute(inputs, outputs, stream_);

     int i_output = 0;
     for (const auto &output : op_desc_->OutputArgumentNames()) {
       if (neglected_output.count(output)) continue;

paddle/fluid/inference/anakin/engine.cc

@@ -33,9 +33,12 @@ namespace inference {
 namespace anakin {

 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary)
+AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary,
+                                                            int device)
     : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
-      net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {}
+      net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
+  device_ = device;
+}

 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 AnakinEngine<TargetT, PrecisionType, RunType>::~AnakinEngine() {}

@@ -63,33 +66,44 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp(
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     const std::map<std::string, framework::LoDTensor *> &inputs,
-    const std::map<std::string, framework::LoDTensor *> &outputs) {
+    const std::map<std::string, framework::LoDTensor *> &outputs,
+    cudaStream_t stream) {
   for (const auto &input : inputs) {
     auto *tensor = input.second;
    auto *data = tensor->data<float>();
-    auto shape = framework::vectorize2int(tensor->dims());
+    auto fluid_input_shape = framework::vectorize2int(tensor->dims());
     auto *anakin_input = net_->get_in(input.first);
-    auto anakin_input_shape = anakin_input->valid_shape();
-    PADDLE_ENFORCE(tensor->numel(), anakin_input_shape.count(),
-                   "the fluid input size should be equal to anakin");
+    auto net_shape = anakin_input->shape();
+    if (tensor->numel() > net_shape.count()) {
+      graph_->Reshape(input.first, fluid_input_shape);
+      net_.reset(new AnakinNetT<TargetT, PrecisionType, RunType>(true));
+      net_->init(*graph_);
+      anakin_input = net_->get_in(input.first);
+    }
+
+    anakin_input->reshape(fluid_input_shape);
+    net_shape = anakin_input->shape();
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       anakin_input_shape);
-    anakin_input->copy_from(tmp_anakin_tensor);
+                                                       net_shape);
+    anakin_input->share_from(tmp_anakin_tensor);
   }
+  net_->prediction();

   for (const auto &output : outputs) {
+    platform::CUDAPlace gpu_place(device_);
     auto *tensor = output.second;
-    auto *data = tensor->data<float>();
-    auto shape = framework::vectorize2int(tensor->dims());
     auto *anakin_output = net_->get_out(output.first);
+    auto *anakin_data = anakin_output->data();
     auto anakin_output_shape = anakin_output->valid_shape();
-    PADDLE_ENFORCE(tensor->numel(), anakin_output_shape.count(),
-                   "the fluid output size should be equal to anakin");
-    ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       anakin_output_shape);
-    anakin_output->share_from(tmp_anakin_tensor);
+    tensor->Resize(framework::make_ddim(anakin_output_shape));
+    auto *fluid_data = tensor->mutable_data<float>(gpu_place);
+    memory::Copy(gpu_place, static_cast<void *>(fluid_data), gpu_place,
+                 static_cast<void *>(anakin_data),
+                 tensor->numel() * sizeof(float), stream);
   }
-  net_->prediction();
+  cudaDeviceSynchronize();
 }

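This Execute rewrite is what the commit message's "support change input size" refers to: when an incoming tensor is larger than the net's current buffer, the engine reshapes the graph and rebuilds net_ on the fly. A hedged sketch of the resulting contract (not code from this commit; engine_, x_small, x_large, and y are assumed to be set up as in test_anakin_engine.cc):

// Reusing one engine across two input sizes.
cudaStream_t stream;
cudaStreamCreate(&stream);

std::map<std::string, framework::LoDTensor *> inputs = {{"x", &x_small}};
std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}};
engine_->Execute(inputs, outputs, stream);  // runs at the original shape

inputs["x"] = &x_large;  // larger numel triggers graph_->Reshape and a net_ rebuild
engine_->Execute(inputs, outputs, stream);

cudaStreamDestroy(stream);
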
paddle/fluid/inference/anakin/engine.h

@@ -18,6 +18,7 @@
 #include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/engine.h"

@@ -26,8 +27,12 @@
 #include "framework/core/net/net.h"
 #include "framework/core/types.h"
 #include "framework/graph/graph.h"
+#include "framework/graph/graph_global_mem.h"
 #include "saber/saber_types.h"

+using anakin::Precision;
+using anakin::saber::NV;
+
 namespace anakin {

 template <typename, Precision, OpRunType>

@@ -50,7 +55,7 @@ class AnakinEngine {
   using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;

  public:
-  explicit AnakinEngine(bool need_summary = false);
+  explicit AnakinEngine(bool need_summary = false, int device = 0);
   ~AnakinEngine();
   void InitGraph();
   void SetInputShape(const std::string &name, std::vector<int> shape);

@@ -69,14 +74,50 @@ class AnakinEngine {
   void Freeze();
   void Optimize();
   void Save(std::string path) { graph_->save(path); }
+  // void SaveSerializedData(std::string& data) { graph_->save_to_string(data); }
+  // void LoadSerializedData(const std::string& data) { graph_->load_from_string(data); }
   void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
-               const std::map<std::string, framework::LoDTensor *> &outputs);
+               const std::map<std::string, framework::LoDTensor *> &outputs,
+               cudaStream_t stream);

  private:
+  int device_;
   std::unique_ptr<GraphT> graph_;
   std::unique_ptr<NetT> net_;
 };

+class AnakinEngineManager {
+  using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
+
+ public:
+  bool HasEngine(const std::string &name) const {
+    if (engines_.count(name) == 0) return false;
+    return engines_.at(name).get() != nullptr;
+  }
+  AnakinNvEngineT *Get(const std::string &name) const {
+    return engines_.at(name).get();
+  }
+
+  AnakinNvEngineT *Create(bool need_summary, int device,
+                          std::string engine_name) {
+    std::unique_lock<std::mutex> lk(mut_);
+    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device);
+    engines_[engine_name].reset(p);
+    return p;
+  }
+
+  void DeleteALL() {
+    for (auto &item : engines_) {
+      item.second.reset(nullptr);
+    }
+  }
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<AnakinNvEngineT>> engines_;
+  std::mutex mut_;
+};
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

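The new AnakinEngineManager decouples engine construction (done once by the subgraph pass) from engine lookup (done at run time by AnakinEngineOp) through a shared string key. A sketch of that handshake, mirroring the two call sites added by this commit (engine_key is whatever GenerateAnakinEngineKey produced):

auto &manager =
    inference::Singleton<inference::anakin::AnakinEngineManager>::Global();

// Build time (anakin_subgraph_pass.cc): create an engine under the key.
auto *engine = manager.Create(true /*need_summary*/, 0 /*device*/, engine_key);

// Run time (anakin_engine_op.h): fetch the same engine by key.
if (manager.HasEngine(engine_key)) {
  auto *same_engine = manager.Get(engine_key);
}
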
paddle/fluid/inference/anakin/test_anakin_engine.cc

@@ -17,9 +17,6 @@ limitations under the License. */
 #include <map>

-#include "framework/core/net/net.h"
-#include "framework/graph/graph.h"
-#include "framework/graph/graph_global_mem.h"
 #include "paddle/fluid/inference/anakin/engine.h"

 using anakin::graph::GraphGlobalMem;

@@ -84,7 +81,9 @@ TEST_F(TestAnakinEngine, Execute) {
   auto *y_data = y.mutable_data<float>(platform::CUDAPlace());
   std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}};

-  engine_->Execute(inputs, outputs);
+  cudaStream_t stream;
+  engine_->Execute(inputs, outputs, stream);
   auto *y_data_gpu = y_data;
   float y_data_cpu[2];
   cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost);

paddle/fluid/inference/analysis/argument.h

@@ -23,6 +23,7 @@
 #pragma once
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>

@@ -55,6 +56,7 @@ struct Argument {
   using unique_ptr_t = std::unique_ptr<void, std::function<void(void *)>>;
   using fusion_statis_t = std::unordered_map<std::string, int>;
+  using engine_opt_info_t = std::map<std::string, std::string>;

   bool Has(const std::string &key) const { return valid_fields_.count(key); }

@@ -107,12 +109,14 @@ struct Argument {
  private:                  \
   unique_ptr_t field__##_;

+  DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
   // Model path
   DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
   // Model specified with program and parameters files.
   DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
   DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
   DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
+  DECL_ARGUMENT_FIELD(engine_opt_info, EngineOptInfo, engine_opt_info_t);

   // The overall graph to work on.
   DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);

@@ -146,6 +150,8 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
                       bool);
+  DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);

   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
   DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);

paddle/fluid/inference/analysis/ir_pass_manager.cc

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>

@@ -71,6 +72,11 @@ void IRPassManager::CreatePasses(Argument *argument,
+    if (pass_name == "anakin_subgraph_pass") {
+      pass->Set("program",
+                new framework::ProgramDesc *(&argument->main_program()));
+      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
+      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+      pass->Set("engine_opt_info", new std::map<std::string, std::string>(
+                                       argument->engine_opt_info()));
+      pass->Set("predictor_id", new int(argument->predictor_id()));
+    }
     if (pass_name == "tensorrt_subgraph_pass") {

@@ -95,6 +101,9 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
       pass->Set("use_static_engine",
                 new bool(argument->tensorrt_use_static_engine()));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+      pass->Set("engine_opt_info", new std::map<std::string, std::string>(
+                                       argument->engine_opt_info()));
     }
     pre_pass = pass_name;

paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc

@@ -21,6 +21,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
 #include "paddle/fluid/inference/anakin/op_teller.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h"

@@ -45,12 +46,20 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
     return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
   };

-  SubGraphFuser fuser(graph.get(), teller, 3 /* min_subgraph_size */);
+  SubGraphFuser fuser(graph.get(), teller, 0 /* min_subgraph_size */);
   fuser();

+  std::vector<std::string> graph_param_names =
+      ExtractAnakinParameters(graph->Nodes());
+  // those parameter already exist in anakin, and should not have another copy
+  // in fluid.
+  std::vector<std::string> repetitive_params;
+
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateAnakinOp(node, graph.get());
+      CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params);
       std::unordered_set<const Node *> nodes2remove(
           Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
       framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);

@@ -64,13 +73,15 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
     }
   }
   framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  graph->Set(framework::ir::kRepetitiveParamAttr,
+             new std::vector<std::string>(repetitive_params));

   return graph;
 }

-std::string GenerateAnakinEngineKey(const std::set<std::string> &engine_inputs,
-                                    const std::set<std::string> &engine_outputs) {
+std::string GenerateAnakinEngineKey(const std::set<std::string> &engine_inputs,
+                                    const std::set<std::string> &engine_outputs,
+                                    std::string id) {
   std::string engine_hash_key = "";
   for (auto name : engine_inputs) {
     engine_hash_key += name;

@@ -78,12 +89,15 @@ std::string GenerateAnakinEngineKey(
   for (auto name : engine_outputs) {
     engine_hash_key += name;
   }
+  engine_hash_key += id;
   auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
   return engine_key;
 }

-void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node,
-                                        Graph *graph) const {
+void AnakinSubgraphPass::CreateAnakinOp(
+    framework::ir::Node *node, Graph *graph,
+    const std::vector<std::string> &graph_params,
+    std::vector<std::string> *repetitive_params) const {
   auto *op_desc = node->Op();
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());

@@ -117,10 +131,16 @@ void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node,
   // is unique.
   std::set<std::string> input_names;
   std::set<std::string> input_names_with_id;
+  std::vector<std::string> params;
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
+      params.push_back(x->Name());
+    }
   }
+  std::copy(params.begin(), params.end(),
+            std::back_inserter(*repetitive_params));
   op_desc->SetInput(
       "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));

@@ -231,10 +251,25 @@ void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node,
   SetAttr(op_desc->Proto(), "parameters",
           ExtractAnakinParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
-  auto engine_key =
-      GenerateAnakinEngineKey(input_names_with_id, output_names_with_id);
+  int predictor_id = Get<int>("predictor_id");
+  auto engine_key = GenerateAnakinEngineKey(
+      input_names_with_id, output_names_with_id, std::to_string(predictor_id));

   SetAttr(op_desc->Proto(), "engine_key", engine_key);
+
+  auto *anakin_engine =
+      inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
+          true, Get<int>("gpu_device_id"), engine_key);
+
+  auto *scope = param_scope();
+  std::unordered_set<std::string> param_set(params.begin(), params.end());
+  framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
+  inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
+      .ConvertBlockToAnakinEngine(
+          &block_desc_temp, *scope,
+          std::vector<std::string>(input_names.begin(), input_names.end()),
+          param_set, output_mapping, anakin_engine);
 }

 std::vector<std::string> ExtractAnakinParameters(

@@ -246,7 +281,7 @@ std::vector<std::string> ExtractAnakinParameters(
   for (const auto &node : nodes) {
     if (!node->IsOp()) continue;
     std::string op_type = node->Op()->Type();
-    if (op_type == "feed") {
+    if (op_type == "feed" || op_type == "fetch") {
       std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
       std::copy(output_names.begin(), output_names.end(),
                 std::back_inserter(feed_outputs));

paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h

@@ -15,8 +15,13 @@
 #pragma once

 #include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/anakin/engine.h"
+
+using anakin::Precision;
+using anakin::saber::NV;

 namespace paddle {
 namespace inference {
 namespace analysis {

@@ -27,8 +32,9 @@ class AnakinSubgraphPass : public framework::ir::FusePassBase {
       std::unique_ptr<framework::ir::Graph> graph) const override;

  private:
-  void CreateAnakinOp(framework::ir::Node *x,
-                      framework::ir::Graph *graph) const;
+  void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph,
+                      const std::vector<std::string> &graph_params,
+                      std::vector<std::string> *repetitive_params) const;
   void CleanIntermediateOutputs(framework::ir::Node *node);
 };

paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include <algorithm>
+#include <map>
 #include <set>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"

@@ -219,7 +220,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
-  SetAttr(op_desc->Proto(), "engine_serialized_data", std::string(""));
+  bool load_from_memory = Get<bool>("model_from_memory");
+  std::string trt_engine_serialized_data = "";
+  if (load_from_memory) {
+    std::map<std::string, std::string> engine_opt_info =
+        Get<std::map<std::string, std::string>>("engine_opt_info");
+    if (engine_opt_info.count(engine_key)) {
+      trt_engine_serialized_data = engine_opt_info[engine_key];
+    }
+  }
+  SetAttr(op_desc->Proto(), "engine_serialized_data",
+          trt_engine_serialized_data);

   std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
   if (enable_int8 && calibration_data.size() != 0) {

@@ -230,10 +241,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // When in int8 mode and calibration_mode, the program just produce the
   // calibration table data.
   bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
-  if (!calibration_mode && use_static_engine) {
+  if (!calibration_mode && use_static_engine &&
+      trt_engine_serialized_data.empty()) {
     std::copy(params.begin(), params.end(),
               std::back_inserter(*repetitive_params));
-    std::string trt_engine_serialized_data = GetTrtEngineSerializedData(
+    trt_engine_serialized_data = GetTrtEngineSerializedData(
         Get<std::string>("model_opt_cache_dir"), engine_key);

     if (trt_engine_serialized_data.empty()) {

paddle/fluid/inference/api/CMakeLists.txt

@@ -64,8 +64,3 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
     anakin_target(inference_anakin_api)
     anakin_target(inference_anakin_api_shared)
 endif()
-
-if (WITH_ANAKIN_SUBGRAPH)
-inference_analysis_test(test_anakin_model SRCS mobilenet_test.cc EXTRA_DEPS paddle_fluid)
-inference_analysis_test(anakin_conv_model SRCS conv_anakin_test.cc EXTRA_DEPS paddle_fluid)
-inference_analysis_test(life_feature_test SRCS life_feature_test.cc EXTRA_DEPS paddle_fluid)
-endif()

paddle/fluid/inference/api/analysis_config.cc

@@ -21,6 +21,7 @@
 #include "paddle/fluid/platform/gpu_info.h"

 namespace paddle {
+extern const std::vector<std::string> kAnakinSubgraphPasses;

 PassStrategy *AnalysisConfig::pass_builder() const {
   if (!pass_builder_.get()) {

@@ -230,6 +231,20 @@ void AnalysisConfig::Update() {
     }
   }

+  if (use_anakin_) {
+    PADDLE_ENFORCE(!use_tensorrt_,
+                   "Anakin sub-graph and TensorRT sub-graph are not allowed to "
+                   "run at the same time!");
+    PADDLE_ENFORCE(use_gpu_,
+                   "Anakin sub-graph engine need gpu, please use the EnableGpu API.");
+
+    pass_builder()->ClearPasses();
+    for (const auto &pass : kAnakinSubgraphPasses) {
+      pass_builder()->AppendPass(pass);
+    }
+  }
+
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }

@@ -266,7 +281,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << specify_input_name_;
   ss << cpu_math_library_num_threads_;
+  ss << use_anakin_;

   return ss.str();
 }

@@ -316,6 +331,11 @@ void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
   Update();
 }

+void AnalysisConfig::SetEngineOptInfo(
+    std::map<std::string, std::string> engine_opt_info) {
+  engine_opt_info_ = engine_opt_info;
+}
+
 NativeConfig AnalysisConfig::ToNativeConfig() const {
   NativeConfig config;
   config.model_dir = model_dir_;

@@ -332,5 +352,8 @@ void AnalysisConfig::SwitchIrDebug(int x) {
   ir_debug_ = x;
   Update();
 }
+void AnalysisConfig::EnableAnakinEngine() {
+  use_anakin_ = true;
+  Update();
+}
 }  // namespace paddle

paddle/fluid/inference/api/analysis_predictor.cc

@@ -351,7 +351,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   argument_.SetStaticMemoryOptimForceUpdate(
       config_.static_memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
+  argument_.SetEngineOptInfo(config_.engine_opt_info_);
   // Analyze inference_program
+  argument_.SetUseAnakin(config_.anakin_engine_enabled());
+  argument_.SetPredictorID(predictor_id_);
   if (!config_.model_dir().empty()) {
     argument_.SetModelDir(config_.model_dir());
   } else {

@@ -375,6 +378,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
   }

+  if (config_.use_gpu() && config_.anakin_engine_enabled()) {
+    LOG(INFO) << "Anakin subgraph engine is enabled";
+  }
+
   if (config_.use_mkldnn_) {
     LOG(INFO) << "MKLDNN is enabled";
     argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);

@@ -404,7 +411,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   VLOG(3) << "create AnalysisConfig";
   if (config.use_gpu()) {
     // 1. GPU memory
-    PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
+    PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
     PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
                       config.gpu_device_id());
     std::vector<std::string> flags;

paddle/fluid/inference/api/analysis_predictor.h

@@ -45,7 +45,9 @@ using framework::NaiveExecutor;
 */
 class AnalysisPredictor : public PaddlePredictor {
  public:
-  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
+  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
+    predictor_id_ = inference::GetUniqueId();
+  }
   ~AnalysisPredictor();

   bool Init(const std::shared_ptr<framework::Scope> &parent_scope,

@@ -152,6 +154,7 @@ class AnalysisPredictor : public PaddlePredictor {
   const size_t max_shape_collect_count_{1000};
   int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
   std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
+  int predictor_id_;

  private:
   // Some status here that help to determine the status inside the predictor.

paddle/fluid/inference/api/paddle_analysis_config.h

@@ -14,9 +14,11 @@
 #pragma once

 #include <cassert>
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_set>
+#include <utility>
 #include <vector>

 /*! \file */

@@ -140,6 +142,14 @@ struct AnalysisConfig {
   /** A boolean state telling whether the TensorRT engine is used.
    */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
+  /**
+   * \brief Turn on the usage of Anakin sub-graph engine.
+   */
+  void EnableAnakinEngine();
+
+  /** A boolean state indicating whether the Anakin sub-graph engine is used.
+   */
+  bool anakin_engine_enabled() const { return use_anakin_; }

   /** \brief Control whether to debug IR graph analysis phase.
    */

@@ -185,6 +195,7 @@ struct AnalysisConfig {
   /** A boolean state telling whether the model is set from the CPU memory.
    */
   bool model_from_memory() const { return model_from_memory_; }
+  void SetEngineOptInfo(std::map<std::string, std::string> engine_opt_info);

   /** Turn on memory optimize
    * NOTE still in development, will release latter.

@@ -258,6 +269,8 @@ struct AnalysisConfig {
   std::string serialized_info_cache_;

   mutable std::unique_ptr<PassStrategy> pass_builder_;
+  bool use_anakin_{false};
+  std::map<std::string, std::string> engine_opt_info_;
 };

 }  // namespace paddle

paddle/fluid/inference/api/paddle_pass_builder.cc

@@ -68,6 +68,17 @@ void GpuPassStrategy::EnableMKLDNN() {
   LOG(ERROR) << "GPU not support MKLDNN yet";
 }

+// The following passes works for Anakin sub-graph engine.
+const std::vector<std::string> kAnakinSubgraphPasses({
+    "infer_clean_graph_pass",                   //
+    "simplify_anakin_detection_pattern_pass3",  //
+    "fc_fuse_pass",                             //
+    "conv_elementwise_add_fuse_pass",           //
+    "conv_bn_fuse_pass",                        //
+    "conv_elementwise_add_fuse_pass",           //
+    "anakin_subgraph_pass",
+});
+
 GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   passes_.assign({
       "infer_clean_graph_pass",  //

@@ -120,4 +131,5 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   });
   use_gpu_ = false;
 }
+void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
 }  // namespace paddle

paddle/fluid/inference/api/paddle_pass_builder.h

@@ -45,6 +45,7 @@ class PaddlePassBuilder {
   /** Delete all the passes that has type `pass_type`. */
   void DeletePass(const std::string &pass_type);
+  void ClearPasses();

   /** Append an analysis pass. */
   void AppendAnalysisPass(const std::string &pass);

@@ -142,4 +143,6 @@ class GpuPassStrategy : public PassStrategy {
   virtual ~GpuPassStrategy() = default;
 };

+extern const std::vector<std::string> kAnakinSubgraphPasses;
+
 }  // namespace paddle

paddle/fluid/operators/anakin/anakin_engine_op.h

@@ -16,6 +16,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA

 #include <fstream>
+#include <map>
 #include <memory>
 #include <string>

@@ -52,8 +53,9 @@ class AnakinEngineOp : public framework::OperatorBase {
  private:
   std::vector<std::string> input_names_;
   std::unordered_set<std::string> param_names_;
-  mutable std::unique_ptr<AnakinNvEngineT> anakin_engine_;
+  mutable AnakinNvEngineT *anakin_engine_;
   std::string engine_key_;
+  std::string engine_serialized_data_;

  public:
   AnakinEngineOp(const std::string &type,

@@ -67,6 +69,7 @@ class AnakinEngineOp : public framework::OperatorBase {
     for (const auto &param : params) {
       param_names_.insert(param);
     }
+    anakin_engine_ = nullptr;
   }

  protected:

@@ -77,12 +80,12 @@ class AnakinEngineOp : public framework::OperatorBase {
   void RunAnakin(const framework::Scope &scope,
                  const platform::Place &dev_place) const {
-    if (anakin_engine_.get() == nullptr) {
-      anakin_engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
-      Prepare(scope, dev_place, anakin_engine_.get());
-    }
+    auto *engine = GetEngine(scope, dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();

-    auto *engine = anakin_engine_.get();
     PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");

     std::vector<std::string> output_maps =

@@ -95,24 +98,48 @@ class AnakinEngineOp : public framework::OperatorBase {
       auto &t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
       auto t_shape = framework::vectorize(t.dims());
+      auto *anakin_input = engine->Net()->get_in(x);
+      auto net_shape = anakin_input->shape();
+      size_t anakin_net_input_size = net_shape.count() * sizeof(float);
+      size_t fluid_input_size = t.memory_size();
+
+      if (fluid_input_size < anakin_net_input_size) {
+        framework::LoDTensor temp_t;
+        auto t_dims = t.dims();
+        temp_t.Resize(t_dims);
+        TensorCopySync(t, dev_place, &temp_t);
+        t.Resize(framework::make_ddim(net_shape));
+        t.mutable_data<float>(dev_place);
+        TensorCopySync(temp_t, dev_place, &t);
+      }
       inputs.insert({x, &t});
     }

     std::map<std::string, framework::LoDTensor *> outputs;
     int output_index = 0;
     for (const auto &y : Outputs("Ys")) {
-      std::vector<int> ddim =
-          engine->Net()->get_out(output_maps[output_index])->valid_shape();
+      // std::vector<int> ddim =
+      //     engine->Net()->get_out(output_maps[output_index])->valid_shape();
+      // we need get the output anakin output shape.
       auto *fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      fluid_t->Resize(framework::make_ddim(ddim));
-      fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
+      // fluid_t->Resize(framework::make_ddim(ddim));
+      // fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
       outputs.insert({output_maps[output_index], fluid_t});
       output_index += 1;
     }
-    engine->Execute(inputs, outputs);
+    engine->Execute(inputs, outputs, stream);
   }

+  AnakinNvEngineT *GetEngine(const framework::Scope &scope,
+                             const platform::Place &dev_place) const {
+    if (anakin_engine_ == nullptr) {
+      anakin_engine_ =
+          inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
+              .Get(engine_key_);
+    }
+    return anakin_engine_;
+  }
+
   void Prepare(const framework::Scope &scope, const platform::Place &dev_place,

@@ -128,8 +155,6 @@ class AnakinEngineOp : public framework::OperatorBase {
     inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
         .ConvertBlock(block_desc, param_names_, scope, engine);
     engine->Freeze();
-    engine->Optimize();
     for (const auto &x : Inputs("Xs")) {
       if (param_names_.count(x)) continue;
       auto &t =

@@ -142,6 +167,9 @@ class AnakinEngineOp : public framework::OperatorBase {
       }
       engine->SetInputShape(x, t_shape);
     }
+    engine->Optimize();
+    engine->InitGraph();
   }
 };
