Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)

Commit 69d37f81
Authored March 20, 2019 by nhzlx
Parent: a1d200a5

cherry-pick from feature/anakin-engine: refine anakin subgraph. #16157

support change input size
Showing 27 changed files with 390 additions and 80 deletions (+390 −80)
paddle/fluid/framework/ir/graph_pattern_detector.cc (+14 −2)
paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc (+10 −3)
paddle/fluid/inference/anakin/CMakeLists.txt (+1 −1)
paddle/fluid/inference/anakin/convert/batch_norm.cc (+2 −0)
paddle/fluid/inference/anakin/convert/density_prior_box.cc (+21 −10)
paddle/fluid/inference/anakin/convert/op_converter.h (+30 −0)
paddle/fluid/inference/anakin/convert/pool2d.cc (+1 −1)
paddle/fluid/inference/anakin/convert/softmax.cc (+1 −1)
paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc (+2 −1)
paddle/fluid/inference/anakin/convert/test_pool2d_op.cc (+41 −0)
paddle/fluid/inference/anakin/convert/ut_helper.h (+1 −1)
paddle/fluid/inference/anakin/engine.cc (+31 −17)
paddle/fluid/inference/anakin/engine.h (+43 −2)
paddle/fluid/inference/anakin/test_anakin_engine.cc (+3 −4)
paddle/fluid/inference/analysis/argument.h (+6 −0)
paddle/fluid/inference/analysis/ir_pass_manager.cc (+9 −0)
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc (+45 −10)
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h (+8 −2)
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc (+15 −3)
paddle/fluid/inference/api/CMakeLists.txt (+0 −5)
paddle/fluid/inference/api/analysis_config.cc (+25 −2)
paddle/fluid/inference/api/analysis_predictor.cc (+8 −1)
paddle/fluid/inference/api/analysis_predictor.h (+4 −1)
paddle/fluid/inference/api/paddle_analysis_config.h (+13 −0)
paddle/fluid/inference/api/paddle_pass_builder.cc (+12 −0)
paddle/fluid/inference/api/paddle_pass_builder.h (+3 −0)
paddle/fluid/operators/anakin/anakin_engine_op.h (+41 −13)
paddle/fluid/framework/ir/graph_pattern_detector.cc

```diff
@@ -1527,6 +1527,16 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
           ->assert_is_op_output("box_coder")
           ->AsIntermediate();
 
+  auto transpose_before_nms =
+      pattern->NewNode(GetNodeName("transpose_before_nms"))
+          ->assert_is_op("transpose2");
+
+  auto transpose_before_nms_out =
+      pattern->NewNode(GetNodeName("transpose_before_nms_out"))
+          ->assert_is_op_output("transpose2")
+          ->assert_is_op_input("multiclass_nms", "Scores")
+          ->AsIntermediate();
+
   auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms"))
                                ->assert_is_op("multiclass_nms")
                                ->assert_op_has_n_inputs("multiclass_nms", 2);
@@ -1577,8 +1587,10 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
       {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
   box_coder_out->LinksFrom({box_coder_op});
 
-  multiclass_nms_op
-      ->LinksFrom({box_coder_out, conv_in[kMultiClassSecondInputNmsOffset]})
+  transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]});
+  transpose_before_nms_out->LinksFrom({transpose_before_nms});
+
+  multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out})
       .LinksTo({multiclass_nms_out});
 
   return multiclass_nms_out;
```
paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc

```diff
@@ -45,7 +45,7 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
   input_nodes.push_back(gpd.mutable_pattern()
                             ->NewNode("x" + std::to_string(times + 1))
-                            ->assert_is_op_input("multiclass_nms", "Scores")
+                            ->assert_is_op_input("transpose2")
                             ->AsInput());
 
   patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
@@ -106,6 +106,11 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
     Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out"));
     Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]);
+    Node *transpose_before_nms =
+        subgraph.at(pattern.GetPDNode("transpose_before_nms"));
+    Node *transpose_before_nms_out =
+        subgraph.at(pattern.GetPDNode("transpose_before_nms_out"));
+
     Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms"));
     Node *multiclass_nms_out =
         subgraph.at(pattern.GetPDNode("multiclass_nms_out"));
@@ -133,11 +138,11 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
           nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
     }
 
-    int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
+    // int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
     framework::OpDesc concat1_desc;
     concat1_desc.SetType("concat");
     concat1_desc.SetInput("X", concat1_input_names);
-    concat1_desc.SetAttr("axis", axis);
+    concat1_desc.SetAttr("axis", 2);
     concat1_desc.SetOutput("Out", {concat_out1->Name()});
 
     auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc);
@@ -184,6 +189,8 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
     delete_nodes.insert(concat_out2);
     delete_nodes.insert(box_coder_op);
     delete_nodes.insert(box_coder_out);
+    delete_nodes.insert(transpose_before_nms);
+    delete_nodes.insert(transpose_before_nms_out);
     delete_nodes.insert(multiclass_nms);
 
     new_add_concat_op->outputs.push_back(concat_out1);
```
paddle/fluid/inference/anakin/CMakeLists.txt

```diff
 cc_library(anakin_engine SRCS engine.cc)
-nv_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
+cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
 target_link_libraries(anakin_engine anakin anakin_saber_common)
 cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
 add_subdirectory(convert)
```
paddle/fluid/inference/anakin/convert/batch_norm.cc

```diff
@@ -43,11 +43,13 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Y").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
   auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
+  // auto momentum = boost::get<float>(op_desc.GetAttr("momentum"));
 
   auto bn_op_name = op_name + ":bn";
   auto bn_output = bn_op_name + "_output";
   engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
   engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
+  engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
 
   auto scale_op_name = op_name + ":scale";
   auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,
```
paddle/fluid/inference/anakin/convert/density_prior_box.cc

```diff
@@ -27,8 +27,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
 void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
                                             const framework::Scope &scope,
                                             bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto input_name = op_desc.Input("Input").front();
@@ -42,34 +42,45 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto fixed_ratios =
       boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
   auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
+  std::vector<float> dens;
+  for (auto &ele : densities) {
+    dens.push_back(static_cast<float>(ele));
+  }
 
   // lack flip
-  auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
+  // auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
   auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
+  for (auto &ele : variances) {
+    LOG(INFO) << ele;
+  }
 
   // lack img_h, img_w
   auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
   auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
   auto offset = boost::get<float>(op_desc.GetAttr("offset"));
-  std::vector<std::string> order = {"MIN", "COM", "MAX"};
+  PTuple<std::string> t_order;
+  t_order.push_back("MIN");
+  t_order.push_back("COM");
+  t_order.push_back("MAX");
 
   std::vector<float> temp_v = {};
 
   engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
   engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
   engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
   engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_sizes", fixed_sizes);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratios", fixed_ratios);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "density", densities);
-  engine_->AddOpAttr(op_name, "is_flip", false);
-  engine_->AddOpAttr(op_name, "is_clip", clip);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
+  engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false));
+  engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false));
   engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
   engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "step_h", step_h);
   engine_->AddOpAttr(op_name, "step_w", step_w);
   engine_->AddOpAttr(op_name, "offset", offset);
-  engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", order);
+  engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", t_order);
 }
 
 }  // namespace anakin
```
paddle/fluid/inference/anakin/convert/op_converter.h

```diff
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 #include "framework/core/types.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -68,6 +69,35 @@ class AnakinOpConverter {
       ConvertOp(op, parameters, scope, engine);
     }
   }
 
+  // The scope here should be inited with the parameter vars.
+  void ConvertBlockToAnakinEngine(
+      framework::BlockDesc *block_desc, const framework::Scope &scope,
+      const std::vector<std::string> &inputs,
+      const std::unordered_set<std::string> &parameters,
+      const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
+    framework::proto::BlockDesc *block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    engine->Freeze();
+    for (auto &input : inputs) {
+      if (parameters.count(input)) continue;
+      auto *var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+      auto var_shape = var->GetShape();
+      PADDLE_ENFORCE(var_shape.size() == 4);
+      std::vector<int> input_shape;
+      for (int i = 0; i < var_shape.size(); i++) {
+        input_shape.push_back(var_shape[i]);
+      }
+      input_shape[0] = 1;
+      engine->SetInputShape(input, input_shape);
+    }
+    engine->Optimize();
+    engine->InitGraph();
+  }
+
   void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
   virtual ~AnakinOpConverter() {}
```
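The helper above fixes the build order for a standalone engine: convert the block, `Freeze()`, register each non-parameter input's 4-D shape (batch forced to 1; `Execute` reshapes per batch at run time), then `Optimize()` and `InitGraph()`. A hedged sketch of a caller, assuming `block_desc`, `scope`, and the name collections are already prepared (as the subgraph pass further down does):

```cpp
// Sketch only: mirrors the call site added in anakin_subgraph_pass.cc.
// block_desc, scope, input_names, param_set, output_mapping, and engine
// are assumed to be prepared by the caller.
inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
    .ConvertBlockToAnakinEngine(&block_desc, scope,
                                input_names,     // non-parameter inputs
                                param_set,       // weights already in anakin
                                output_mapping,  // engine output names
                                engine);
```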
paddle/fluid/inference/anakin/convert/pool2d.cc

```diff
@@ -55,7 +55,7 @@ void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
   if (pool_type == "max") {
     anakin_pool_type = "MAX";
   } else if (pool_type == "avg") {
-    anakin_pool_type = "AVG";
+    anakin_pool_type = "AVGEXC";
   } else {
     PADDLE_THROW("TensorRT unsupported pooling type!");
   }
```
paddle/fluid/inference/anakin/convert/softmax.cc

```diff
@@ -33,7 +33,7 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
 
   engine_->AddOp(op_name, "Softmax", {input}, {output});
-  engine_->AddOpAttr(op_name, "axis", 1);
+  engine_->AddOpAttr(op_name, "axis", 2);
 }
 
 }  // namespace anakin
```
paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc

```diff
@@ -52,8 +52,9 @@ TEST(batch_norm_op, test) {
   desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});
 
   float eps = 1e-5f;
+  bool is_test = true;
   desc.SetAttr("epsilon", eps);
-  desc.SetAttr("is_test", true);
+  desc.SetAttr("is_test", is_test);
 
   validator.SetOp(*desc.Proto());
```
paddle/fluid/inference/anakin/convert/test_pool2d_op.cc

```diff
@@ -64,11 +64,52 @@ void test_pool2d(bool global_pooling, bool ceil_mode,
   validator.Execute(1);
 }
 
+void test_pool2d2(bool global_pooling, bool ceil_mode,
+                  std::string pool_type = "max") {
+  auto *pool2d_converter =
+      Registry<AnakinOpConverter>::Global().Lookup("pool2d");
+  ASSERT_TRUE(pool2d_converter);
+
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  AnakinConvertValidation validator(parameters, scope);
+
+  // The ITensor's Dims should not contain the batch size.
+  // So, the ITensor's Dims of input and output should be C * H * W.
+  validator.DeclInputVar("pool2d_x", {1, 1, 17, 17});
+  validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17});
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("pool2d");
+  desc.SetInput("X", {"pool2d_x"});
+  desc.SetOutput("Out", {"pool2d_out"});
+
+  std::vector<int> ksize({3, 3});
+  std::vector<int> strides({1, 1});
+  std::vector<int> paddings({1, 1});
+  std::string pooling_t = pool_type;
+
+  desc.SetAttr("pooling_type", pooling_t);
+  desc.SetAttr("ksize", ksize);
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+  desc.SetAttr("global_pooling", global_pooling);
+  desc.SetAttr("ceil_mode", true);
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(1);
+}
+
 TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
 TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }
 
 TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); }
 TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
+TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); }
 
 }  // namespace anakin
 }  // namespace inference
```
paddle/fluid/inference/anakin/convert/ut_helper.h

```diff
@@ -168,7 +168,7 @@ class AnakinConvertValidation {
       outputs.insert({output, tensor});
     }
 
-    engine_->Execute(inputs, outputs);
+    engine_->Execute(inputs, outputs, stream_);
     int i_output = 0;
     for (const auto &output : op_desc_->OutputArgumentNames()) {
       if (neglected_output.count(output)) continue;
```
paddle/fluid/inference/anakin/engine.cc

```diff
@@ -33,9 +33,12 @@ namespace inference {
 namespace anakin {
 
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary)
+AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary,
+                                                            int device)
     : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
-      net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {}
+      net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
+  device_ = device;
+}
 
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 AnakinEngine<TargetT, PrecisionType, RunType>::~AnakinEngine() {}
@@ -63,33 +66,44 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp(
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     const std::map<std::string, framework::LoDTensor *> &inputs,
-    const std::map<std::string, framework::LoDTensor *> &outputs) {
+    const std::map<std::string, framework::LoDTensor *> &outputs,
+    cudaStream_t stream) {
   for (const auto &input : inputs) {
     auto *tensor = input.second;
    auto *data = tensor->data<float>();
-    auto shape = framework::vectorize2int(tensor->dims());
+    auto fluid_input_shape = framework::vectorize2int(tensor->dims());
     auto *anakin_input = net_->get_in(input.first);
-    auto anakin_input_shape = anakin_input->valid_shape();
-    PADDLE_ENFORCE(tensor->numel(), anakin_input_shape.count(),
-                   "the fluid input size should be equal to anakin");
+    auto net_shape = anakin_input->shape();
+    if (tensor->numel() > net_shape.count()) {
+      graph_->Reshape(input.first, fluid_input_shape);
+      net_.reset(new AnakinNetT<TargetT, PrecisionType, RunType>(true));
+      net_->init(*graph_);
+      anakin_input = net_->get_in(input.first);
+    }
+
+    anakin_input->reshape(fluid_input_shape);
+    net_shape = anakin_input->shape();
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       anakin_input_shape);
-    anakin_input->copy_from(tmp_anakin_tensor);
+                                                       net_shape);
+    anakin_input->share_from(tmp_anakin_tensor);
   }
 
+  net_->prediction();
   for (const auto &output : outputs) {
+    platform::CUDAPlace gpu_place(device_);
     auto *tensor = output.second;
-    auto *data = tensor->data<float>();
-    auto shape = framework::vectorize2int(tensor->dims());
     auto *anakin_output = net_->get_out(output.first);
+    auto *anakin_data = anakin_output->data();
     auto anakin_output_shape = anakin_output->valid_shape();
-    PADDLE_ENFORCE(tensor->numel(), anakin_output_shape.count(),
-                   "the fluid output size should be equal to anakin");
-    ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       anakin_output_shape);
-    anakin_output->share_from(tmp_anakin_tensor);
+    tensor->Resize(framework::make_ddim(anakin_output_shape));
+    auto *fluid_data = tensor->mutable_data<float>(gpu_place);
+    memory::Copy(gpu_place, static_cast<void *>(fluid_data), gpu_place,
+                 static_cast<void *>(anakin_data),
+                 tensor->numel() * sizeof(float), stream);
   }
-  net_->prediction();
   cudaDeviceSynchronize();
 }
```
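`Execute` now rebuilds the Anakin net only when an input grows past the net's current buffer (`graph_->Reshape` plus a fresh `net_`); smaller inputs are handled by an in-place `reshape` and `share_from`, which avoids a copy. A minimal standalone sketch of the same grow-only policy, with hypothetical types (this is not the Anakin API):

```cpp
#include <cstddef>
#include <vector>

// Hypothetical buffer that mimics the grow-only rebuild policy used in
// AnakinEngine::Execute: reallocate only when the new extent exceeds the
// current capacity, otherwise just change the logical shape in place.
struct GrowOnlyBuffer {
  std::vector<float> storage;  // backing memory (capacity = storage.size())
  std::vector<int> shape;      // current logical shape

  static size_t Count(const std::vector<int> &s) {
    size_t n = 1;
    for (int d : s) n *= static_cast<size_t>(d);
    return n;
  }

  void Reshape(const std::vector<int> &new_shape) {
    size_t needed = Count(new_shape);
    if (needed > storage.size()) {
      storage.resize(needed);  // expensive path (net_.reset + init in Paddle)
    }
    shape = new_shape;         // cheap path: reinterpret existing memory
  }
};
```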
paddle/fluid/inference/anakin/engine.h

```diff
@@ -18,6 +18,7 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/engine.h"
@@ -26,8 +27,12 @@
 #include "framework/core/net/net.h"
 #include "framework/core/types.h"
 #include "framework/graph/graph.h"
+#include "framework/graph/graph_global_mem.h"
 #include "saber/saber_types.h"
 
+using anakin::Precision;
+using anakin::saber::NV;
+
 namespace anakin {
 
 template <typename, Precision, OpRunType>
@@ -50,7 +55,7 @@ class AnakinEngine {
   using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
 
  public:
-  explicit AnakinEngine(bool need_summary = false);
+  explicit AnakinEngine(bool need_summary = false, int device = 0);
   ~AnakinEngine();
   void InitGraph();
   void SetInputShape(const std::string &name, std::vector<int> shape);
@@ -69,14 +74,50 @@ class AnakinEngine {
   void Freeze();
   void Optimize();
   void Save(std::string path) { graph_->save(path); }
+  // void SaveSerializedData(std::string& data) { graph_->save_to_string(data);
+  // }
+  // void LoadSerializedData(const std::string& data) {
+  // graph_->load_from_string(data); }
   void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
-               const std::map<std::string, framework::LoDTensor *> &outputs);
+               const std::map<std::string, framework::LoDTensor *> &outputs,
+               cudaStream_t stream);
 
  private:
+  int device_;
   std::unique_ptr<GraphT> graph_;
   std::unique_ptr<NetT> net_;
 };
 
+class AnakinEngineManager {
+  using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
+
+ public:
+  bool HasEngine(const std::string &name) const {
+    if (engines_.count(name) == 0) return false;
+    return engines_.at(name).get() != nullptr;
+  }
+
+  AnakinNvEngineT *Get(const std::string &name) const {
+    return engines_.at(name).get();
+  }
+
+  AnakinNvEngineT *Create(bool need_summary, int device,
+                          std::string engine_name) {
+    std::unique_lock<std::mutex> lk(mut_);
+    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device);
+    engines_[engine_name].reset(p);
+    return p;
+  }
+
+  void DeleteALL() {
+    for (auto &item : engines_) {
+      item.second.reset(nullptr);
+    }
+  }
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<AnakinNvEngineT>> engines_;
+  std::mutex mut_;
+};
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
```
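`AnakinEngineManager` keys engines by the `engine_key` attribute that the subgraph pass stamps onto each `anakin_engine` op, so the pass-time `Create` and the run-time `Get` resolve to the same instance. A hedged sketch of the round trip (the key string is illustrative, not one the pass would actually generate):

```cpp
// Sketch only: both sides go through the Singleton-wrapped manager,
// as the diffs in anakin_subgraph_pass.cc and anakin_engine_op.h do.
void EngineRoundTrip() {
  auto &manager = paddle::inference::Singleton<
      paddle::inference::anakin::AnakinEngineManager>::Global();

  // Analysis time (anakin_subgraph_pass.cc): create and register the engine.
  auto *engine = manager.Create(true /* need_summary */, 0 /* device */,
                                "engine_key_123" /* illustrative key */);

  // Run time (anakin_engine_op.h): resolve the same engine by key.
  if (manager.HasEngine("engine_key_123")) {
    auto *same_engine = manager.Get("engine_key_123");  // same object
    (void)same_engine;
  }
  (void)engine;
}
```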
paddle/fluid/inference/anakin/test_anakin_engine.cc

```diff
@@ -17,9 +17,6 @@ limitations under the License. */
 #include <map>
 
-#include "framework/core/net/net.h"
-#include "framework/graph/graph.h"
-#include "framework/graph/graph_global_mem.h"
 #include "paddle/fluid/inference/anakin/engine.h"
 
 using anakin::graph::GraphGlobalMem;
@@ -84,7 +81,9 @@ TEST_F(TestAnakinEngine, Execute) {
   auto *y_data = y.mutable_data<float>(platform::CUDAPlace());
   std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}};
 
-  engine_->Execute(inputs, outputs);
+  cudaStream_t stream;
+  engine_->Execute(inputs, outputs, stream);
+
   auto *y_data_gpu = y_data;
   float y_data_cpu[2];
   cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost);
```
paddle/fluid/inference/analysis/argument.h

```diff
@@ -23,6 +23,7 @@
 #pragma once
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -55,6 +56,7 @@ struct Argument {
   using unique_ptr_t = std::unique_ptr<void, std::function<void(void *)>>;
   using fusion_statis_t = std::unordered_map<std::string, int>;
+  using engine_opt_info_t = std::map<std::string, std::string>;
 
   bool Has(const std::string &key) const { return valid_fields_.count(key); }
@@ -107,12 +109,14 @@ struct Argument {
  private: \
   unique_ptr_t field__##_;
 
+  DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
   // Model path
   DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
   // Model specified with program and parameters files.
   DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
   DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
   DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
+  DECL_ARGUMENT_FIELD(engine_opt_info, EngineOptInfo, engine_opt_info_t);
 
   // The overall graph to work on.
   DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
@@ -146,6 +150,8 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
                       bool);
+  DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
+
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
   DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
```
paddle/fluid/inference/analysis/ir_pass_manager.cc

```diff
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -71,6 +72,11 @@ void IRPassManager::CreatePasses(Argument *argument,
+    if (pass_name == "anakin_subgraph_pass") {
+      pass->Set("program",
+                new framework::ProgramDesc *(&argument->main_program()));
+      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
+      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+      pass->Set("engine_opt_info", new std::map<std::string, std::string>(
+                                       argument->engine_opt_info()));
+      pass->Set("predictor_id", new int(argument->predictor_id()));
+    }
     if (pass_name == "tensorrt_subgraph_pass") {
@@ -95,6 +101,9 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
       pass->Set("use_static_engine",
                 new bool(argument->tensorrt_use_static_engine()));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+      pass->Set("engine_opt_info", new std::map<std::string, std::string>(
+                                       argument->engine_opt_info()));
     }
 
     pre_pass = pass_name;
```
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc

```diff
@@ -21,6 +21,7 @@
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
 #include "paddle/fluid/inference/anakin/op_teller.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h"
@@ -45,12 +46,20 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
     return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
   };
 
-  SubGraphFuser fuser(graph.get(), teller, 3 /* min_subgraph_size */);
+  SubGraphFuser fuser(graph.get(), teller, 0 /* min_subgraph_size */);
   fuser();
 
+  std::vector<std::string> graph_param_names =
+      ExtractAnakinParameters(graph->Nodes());
+  // those parameter already exist in anakin, and should not have another copy
+  // in fluid.
+  std::vector<std::string> repetitive_params;
+
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateAnakinOp(node, graph.get());
+      CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params);
       std::unordered_set<const Node *> nodes2remove(
           Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
       framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
@@ -64,13 +73,15 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
     }
   }
 
   framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  graph->Set(framework::ir::kRepetitiveParamAttr,
+             new std::vector<std::string>(repetitive_params));
 
   return graph;
 }
 
-std::string GenerateAnakinEngineKey(
-    const std::set<std::string> &engine_inputs,
-    const std::set<std::string> &engine_outputs) {
+std::string GenerateAnakinEngineKey(const std::set<std::string> &engine_inputs,
+                                    const std::set<std::string> &engine_outputs,
+                                    std::string id) {
   std::string engine_hash_key = "";
   for (auto name : engine_inputs) {
     engine_hash_key += name;
@@ -78,12 +89,15 @@ std::string GenerateAnakinEngineKey(
   for (auto name : engine_outputs) {
     engine_hash_key += name;
   }
+  engine_hash_key += id;
   auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
   return engine_key;
 }
 
-void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node,
-                                        Graph *graph) const {
+void AnakinSubgraphPass::CreateAnakinOp(
+    framework::ir::Node *node, Graph *graph,
+    const std::vector<std::string> &graph_params,
+    std::vector<std::string> *repetitive_params) const {
   auto *op_desc = node->Op();
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());
@@ -117,10 +131,16 @@ void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node,
   // is unique.
   std::set<std::string> input_names;
   std::set<std::string> input_names_with_id;
+  std::vector<std::string> params;
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
+      params.push_back(x->Name());
+    }
   }
+  std::copy(params.begin(), params.end(),
+            std::back_inserter(*repetitive_params));
   op_desc->SetInput(
       "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
@@ -231,10 +251,25 @@ void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node,
   SetAttr(op_desc->Proto(), "parameters",
           ExtractAnakinParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
-  auto engine_key =
-      GenerateAnakinEngineKey(input_names_with_id, output_names_with_id);
+  int predictor_id = Get<int>("predictor_id");
+  auto engine_key = GenerateAnakinEngineKey(
+      input_names_with_id, output_names_with_id, std::to_string(predictor_id));
 
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
+  auto *anakin_engine =
+      inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
+          true, Get<int>("gpu_device_id"), engine_key);
+
+  auto *scope = param_scope();
+  std::unordered_set<std::string> param_set(params.begin(), params.end());
+  framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
+  inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
+      .ConvertBlockToAnakinEngine(
+          &block_desc_temp, *scope,
+          std::vector<std::string>(input_names.begin(), input_names.end()),
+          param_set, output_mapping, anakin_engine);
 }
 
 std::vector<std::string> ExtractAnakinParameters(
@@ -246,7 +281,7 @@ std::vector<std::string> ExtractAnakinParameters(
   for (const auto &node : nodes) {
     if (!node->IsOp()) continue;
     std::string op_type = node->Op()->Type();
-    if (op_type == "feed") {
+    if (op_type == "feed" || op_type == "fetch") {
       std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
       std::copy(output_names.begin(), output_names.end(),
                 std::back_inserter(feed_outputs));
```
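`GenerateAnakinEngineKey` now folds the predictor id into the hash, so two predictors that carve out identical subgraphs still get distinct engines in `AnakinEngineManager`. A standalone restatement of the scheme (hypothetical helper name, same logic as the diff):

```cpp
#include <functional>
#include <set>
#include <string>

// Concatenate sorted input names, sorted output names, and the predictor id,
// then hash the whole string; the decimal hash becomes the engine key.
std::string MakeEngineKey(const std::set<std::string> &inputs,
                          const std::set<std::string> &outputs,
                          const std::string &id) {
  std::string hash_key;
  for (const auto &name : inputs) hash_key += name;
  for (const auto &name : outputs) hash_key += name;
  hash_key += id;
  return std::to_string(std::hash<std::string>()(hash_key));
}
```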
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h

```diff
@@ -15,8 +15,13 @@
 #pragma once
 
 #include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/anakin/engine.h"
 
+using anakin::Precision;
+using anakin::saber::NV;
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -27,8 +32,9 @@ class AnakinSubgraphPass : public framework::ir::FusePassBase {
   std::unique_ptr<framework::ir::Graph> ApplyImpl(
       std::unique_ptr<framework::ir::Graph> graph) const override;
 
  private:
-  void CreateAnakinOp(framework::ir::Node *x,
-                      framework::ir::Graph *graph) const;
+  void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph,
+                      const std::vector<std::string> &graph_params,
+                      std::vector<std::string> *repetitive_params) const;
   void CleanIntermediateOutputs(framework::ir::Node *node);
 };
```
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc

```diff
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <algorithm>
+#include <map>
 #include <set>
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -219,7 +220,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
-  SetAttr(op_desc->Proto(), "engine_serialized_data", std::string(""));
+  bool load_from_memory = Get<bool>("model_from_memory");
+  std::string trt_engine_serialized_data = "";
+  if (load_from_memory) {
+    std::map<std::string, std::string> engine_opt_info =
+        Get<std::map<std::string, std::string>>("engine_opt_info");
+    if (engine_opt_info.count(engine_key)) {
+      trt_engine_serialized_data = engine_opt_info[engine_key];
+    }
+  }
+  SetAttr(op_desc->Proto(), "engine_serialized_data",
+          trt_engine_serialized_data);
 
   std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
   if (enable_int8 && calibration_data.size() != 0) {
@@ -230,10 +241,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // When in int8 mode and calibration_mode, the program just produce the
   // calibration table data.
   bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
-  if (!calibration_mode && use_static_engine) {
+  if (!calibration_mode && use_static_engine &&
+      trt_engine_serialized_data.empty()) {
     std::copy(params.begin(), params.end(),
               std::back_inserter(*repetitive_params));
-    std::string trt_engine_serialized_data = GetTrtEngineSerializedData(
+    trt_engine_serialized_data = GetTrtEngineSerializedData(
         Get<std::string>("model_opt_cache_dir"), engine_key);
 
     if (trt_engine_serialized_data.empty()) {
```
paddle/fluid/inference/api/CMakeLists.txt

```diff
@@ -64,8 +64,3 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
   anakin_target(inference_anakin_api)
   anakin_target(inference_anakin_api_shared)
 endif()
-if(WITH_ANAKIN_SUBGRAPH)
-  inference_analysis_test(test_anakin_model SRCS mobilenet_test.cc EXTRA_DEPS paddle_fluid)
-  inference_analysis_test(anakin_conv_model SRCS conv_anakin_test.cc EXTRA_DEPS paddle_fluid)
-  inference_analysis_test(life_feature_test SRCS life_feature_test.cc EXTRA_DEPS paddle_fluid)
-endif()
```
paddle/fluid/inference/api/analysis_config.cc

```diff
@@ -21,6 +21,7 @@
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
+extern const std::vector<std::string> kAnakinSubgraphPasses;
 
 PassStrategy *AnalysisConfig::pass_builder() const {
   if (!pass_builder_.get()) {
@@ -230,6 +231,20 @@ void AnalysisConfig::Update() {
     }
   }
 
+  if (use_anakin_) {
+    PADDLE_ENFORCE(!use_tensorrt_,
+                   "Anakin sub-graph and TensorRT sub-graph are not allowed to "
+                   "run at the same time!");
+    PADDLE_ENFORCE(
+        use_gpu_,
+        "Anakin sub-graph engine need gpu, please use the EnableGpu API.");
+
+    pass_builder()->ClearPasses();
+    for (const auto &pass : kAnakinSubgraphPasses) {
+      pass_builder()->AppendPass(pass);
+    }
+  }
+
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }
@@ -266,7 +281,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << specify_input_name_;
 
   ss << cpu_math_library_num_threads_;
-
+  ss << use_anakin_;
   return ss.str();
 }
@@ -316,6 +331,11 @@ void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
   Update();
 }
 
+void AnalysisConfig::SetEngineOptInfo(
+    std::map<std::string, std::string> engine_opt_info) {
+  engine_opt_info_ = engine_opt_info;
+}
+
 NativeConfig AnalysisConfig::ToNativeConfig() const {
   NativeConfig config;
   config.model_dir = model_dir_;
@@ -332,5 +352,8 @@ void AnalysisConfig::SwitchIrDebug(int x) {
   ir_debug_ = x;
   Update();
 }
+void AnalysisConfig::EnableAnakinEngine() {
+  use_anakin_ = true;
+  Update();
+}
 }  // namespace paddle
```
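With `EnableAnakinEngine` wired into `AnalysisConfig::Update`, turning the feature on clears the GPU pass list and installs `kAnakinSubgraphPasses` instead. A hedged usage sketch from the application side; the model path and pool size are placeholders, and the surrounding calls (`SetModel`, `EnableUseGpu`, `CreatePaddlePredictor`) are the pre-existing inference API, not part of this commit:

```cpp
#include <memory>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Sketch only: builds a predictor that routes supported subgraphs to Anakin.
std::unique_ptr<paddle::PaddlePredictor> MakeAnakinPredictor() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model_dir");  // placeholder path
  config.EnableUseGpu(100 /* MB pool */, 0 /* device id */);
  config.EnableAnakinEngine();            // added by this commit
  // Update() enforces that TensorRT stays off while Anakin is on.
  return paddle::CreatePaddlePredictor(config);
}
```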
paddle/fluid/inference/api/analysis_predictor.cc

```diff
@@ -351,7 +351,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   argument_.SetStaticMemoryOptimForceUpdate(
       config_.static_memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
+  argument_.SetEngineOptInfo(config_.engine_opt_info_);
   // Analyze inference_program
+  argument_.SetUseAnakin(config_.anakin_engine_enabled());
+  argument_.SetPredictorID(predictor_id_);
   if (!config_.model_dir().empty()) {
     argument_.SetModelDir(config_.model_dir());
   } else {
@@ -375,6 +378,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
   }
 
+  if (config_.use_gpu() && config_.anakin_engine_enabled()) {
+    LOG(INFO) << "Anakin subgraph engine is enabled";
+  }
+
   if (config_.use_mkldnn_) {
     LOG(INFO) << "MKLDNN is enabled";
     argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
@@ -404,7 +411,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   VLOG(3) << "create AnalysisConfig";
   if (config.use_gpu()) {
     // 1. GPU memory
-    PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
+    PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
     PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
                       config.gpu_device_id());
     std::vector<std::string> flags;
```
paddle/fluid/inference/api/analysis_predictor.h

```diff
@@ -45,7 +45,9 @@ using framework::NaiveExecutor;
  */
 class AnalysisPredictor : public PaddlePredictor {
  public:
-  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
+  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
+    predictor_id_ = inference::GetUniqueId();
+  }
   ~AnalysisPredictor();
 
   bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
@@ -152,6 +154,7 @@ class AnalysisPredictor : public PaddlePredictor {
   const size_t max_shape_collect_count_{1000};
   int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
   std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
+  int predictor_id_;
 
  private:
   // Some status here that help to determine the status inside the predictor.
```
paddle/fluid/inference/api/paddle_analysis_config.h

```diff
@@ -14,9 +14,11 @@
 #pragma once
 
 #include <cassert>
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 
 /*! \file */
@@ -140,6 +142,14 @@ struct AnalysisConfig {
   /** A boolean state telling whether the TensorRT engine is used.
    */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
+  /**
+   * \brief Turn on the usage of Anakin sub-graph engine.
+   */
+  void EnableAnakinEngine();
+
+  /** A boolean state indicating whether the Anakin sub-graph engine is used.
+   */
+  bool anakin_engine_enabled() const { return use_anakin_; }
 
   /** \brief Control whether to debug IR graph analysis phase.
    */
@@ -185,6 +195,7 @@ struct AnalysisConfig {
   /** A boolean state telling whether the model is set from the CPU memory.
    */
   bool model_from_memory() const { return model_from_memory_; }
+  void SetEngineOptInfo(std::map<std::string, std::string> engine_opt_info);
 
   /** Turn on memory optimize
    * NOTE still in development, will release latter.
@@ -258,6 +269,8 @@ struct AnalysisConfig {
   std::string serialized_info_cache_;
 
   mutable std::unique_ptr<PassStrategy> pass_builder_;
+  bool use_anakin_{false};
+  std::map<std::string, std::string> engine_opt_info_;
 };
 
 }  // namespace paddle
```
paddle/fluid/inference/api/paddle_pass_builder.cc

```diff
@@ -68,6 +68,17 @@ void GpuPassStrategy::EnableMKLDNN() {
   LOG(ERROR) << "GPU not support MKLDNN yet";
 }
 
+// The following passes works for Anakin sub-graph engine.
+const std::vector<std::string> kAnakinSubgraphPasses({
+    "infer_clean_graph_pass",                   //
+    "simplify_anakin_detection_pattern_pass3",  //
+    "fc_fuse_pass",                             //
+    "conv_elementwise_add_fuse_pass",           //
+    "conv_bn_fuse_pass",                        //
+    "conv_elementwise_add_fuse_pass",           //
+    "anakin_subgraph_pass",
+});
+
 GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   passes_.assign({
       "infer_clean_graph_pass",  //
@@ -120,4 +131,5 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   });
   use_gpu_ = false;
 }
+void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
 }  // namespace paddle
```
paddle/fluid/inference/api/paddle_pass_builder.h

```diff
@@ -45,6 +45,7 @@ class PaddlePassBuilder {
   /** Delete all the passes that has type `pass_type`. */
   void DeletePass(const std::string &pass_type);
+  void ClearPasses();
 
   /** Append an analysis pass. */
   void AppendAnalysisPass(const std::string &pass);
@@ -142,4 +143,6 @@ class GpuPassStrategy : public PassStrategy {
   virtual ~GpuPassStrategy() = default;
 };
 
+extern const std::vector<std::string> kAnakinSubgraphPasses;
+
 }  // namespace paddle
```
paddle/fluid/operators/anakin/anakin_engine_op.h

```diff
@@ -16,6 +16,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 
 #include <fstream>
+#include <map>
 #include <memory>
 #include <string>
@@ -52,8 +53,9 @@ class AnakinEngineOp : public framework::OperatorBase {
 private:
  std::vector<std::string> input_names_;
  std::unordered_set<std::string> param_names_;
-  mutable std::unique_ptr<AnakinNvEngineT> anakin_engine_;
+  mutable AnakinNvEngineT *anakin_engine_;
  std::string engine_key_;
+  std::string engine_serialized_data_;
 
 public:
  AnakinEngineOp(const std::string &type,
@@ -67,6 +69,7 @@ class AnakinEngineOp : public framework::OperatorBase {
    for (const auto &param : params) {
      param_names_.insert(param);
    }
+    anakin_engine_ = nullptr;
  }
 
 protected:
@@ -77,12 +80,12 @@ class AnakinEngineOp : public framework::OperatorBase {
  void RunAnakin(const framework::Scope &scope,
                 const platform::Place &dev_place) const {
-    if (anakin_engine_.get() == nullptr) {
-      anakin_engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
-      Prepare(scope, dev_place, anakin_engine_.get());
-    }
+    auto *engine = GetEngine(scope, dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
 
-    auto *engine = anakin_engine_.get();
    PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
 
    std::vector<std::string> output_maps =
@@ -95,24 +98,48 @@ class AnakinEngineOp : public framework::OperatorBase {
      auto &t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
      auto t_shape = framework::vectorize(t.dims());
+      auto *anakin_input = engine->Net()->get_in(x);
+      auto net_shape = anakin_input->shape();
+      size_t anakin_net_input_size = net_shape.count() * sizeof(float);
+      size_t fluid_input_size = t.memory_size();
+
+      if (fluid_input_size < anakin_net_input_size) {
+        framework::LoDTensor temp_t;
+        auto t_dims = t.dims();
+        temp_t.Resize(t_dims);
+        TensorCopySync(t, dev_place, &temp_t);
+        t.Resize(framework::make_ddim(net_shape));
+        t.mutable_data<float>(dev_place);
+        TensorCopySync(temp_t, dev_place, &t);
+      }
      inputs.insert({x, &t});
    }
 
    std::map<std::string, framework::LoDTensor *> outputs;
    int output_index = 0;
    for (const auto &y : Outputs("Ys")) {
-      std::vector<int> ddim =
-          engine->Net()->get_out(output_maps[output_index])->valid_shape();
+      // std::vector<int> ddim =
+      //     engine->Net()->get_out(output_maps[output_index])->valid_shape();
+      // we need get the output anakin output shape.
      auto *fluid_v = scope.FindVar(y);
      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      fluid_t->Resize(framework::make_ddim(ddim));
-      fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
+      // fluid_t->Resize(framework::make_ddim(ddim));
+      // fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
      outputs.insert({output_maps[output_index], fluid_t});
      output_index += 1;
    }
-    engine->Execute(inputs, outputs);
+    engine->Execute(inputs, outputs, stream);
+  }
+
+  AnakinNvEngineT *GetEngine(const framework::Scope &scope,
+                             const platform::Place &dev_place) const {
+    if (anakin_engine_ == nullptr) {
+      anakin_engine_ =
+          inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
+              .Get(engine_key_);
+    }
+    return anakin_engine_;
+  }
 
  void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
@@ -128,8 +155,6 @@ class AnakinEngineOp : public framework::OperatorBase {
    inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
        .ConvertBlock(block_desc, param_names_, scope, engine);
-    engine->Freeze();
-    engine->Optimize();
    for (const auto &x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      auto &t =
@@ -142,6 +167,9 @@ class AnakinEngineOp : public framework::OperatorBase {
      }
      engine->SetInputShape(x, t_shape);
    }
+    engine->Optimize();
+    engine->InitGraph();
  }
 };
```