BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 6da043eb (unverified)
Authored by ceci3 on Apr 16, 2021; committed by GitHub on Apr 16, 2021
Parent: fabdb43c

support ernie trt-int8 for inference (#32232)

* support ernie trt-int8 for inference
* fix reshape
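For context, a minimal sketch of how the feature this commit adds would be exercised from Paddle's C++ inference API. This is not part of the commit: the header path, model files, and TensorRT parameters below are illustrative assumptions, and only the paddle_infer calls shown are assumed to be the public interface of this era.

```cpp
#include "paddle_inference_api.h"  // paddle_infer; path depends on the install

int main() {
  paddle_infer::Config config;
  // Hypothetical location of an ERNIE model exported for inference.
  config.SetModel("./ernie/model.pdmodel", "./ernie/model.pdiparams");
  config.EnableUseGpu(100 /* initial pool, MB */, 0 /* GPU id */);
  // Run the TensorRT subgraph engine in int8. The scales written into the
  // program by the quantization pass touched in this commit are consumed
  // by the converters below, so calibration at runtime is disabled here.
  config.EnableTensorRtEngine(1 << 30 /* workspace bytes */,
                              1 /* max batch */, 5 /* min subgraph size */,
                              paddle_infer::PrecisionType::kInt8,
                              false /* use_static */,
                              false /* use_calib_mode */);
  auto predictor = paddle_infer::CreatePredictor(config);
  return 0;
}
```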
Showing 11 changed files with 167 additions and 19 deletions (+167 -19):
paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc    +5   -0
paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc               +32  -0
paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc                 +4   -0
paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc      +5   -1
paddle/fluid/inference/tensorrt/convert/fc_op.cc                      +51  -12
paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc        +44  -5
paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc             +5   -0
paddle/fluid/inference/tensorrt/convert/slice_op.cc                   +6   -0
paddle/fluid/inference/tensorrt/convert/stack_op.cc                   +5   -0
paddle/fluid/inference/tensorrt/op_teller.cc                          +6   -0
python/paddle/fluid/contrib/slim/quantization/quantization_pass.py    +4   -1
paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -299,6 +299,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope
     new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()});
     new_op_desc.SetAttr("epsilon",
                         end_patter_layernorms[k]->Op()->GetAttr("epsilon"));
+
+    if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) {
+      new_op_desc.SetAttr("enable_int8", true);
+    }
+
     auto* embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc);
     for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) {
paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -535,6 +535,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
     multihead_op_desc.SetAttr("alpha", scale_attr);
     multihead_op_desc.SetAttr("head_number", head_number);
+
+    auto* mul0_op_desc = mul0->Op();
+    auto* mul1_op_desc = mul1->Op();
+    auto* mul2_op_desc = mul2->Op();
+    if (mul0_op_desc->HasAttr("enable_int8")) {
+      multihead_op_desc.SetAttr("enable_int8",
+                                mul0_op_desc->GetAttr("enable_int8"));
+      // all mul op has same input.
+      multihead_op_desc.SetAttr("Input_scale",
+                                mul0_op_desc->GetAttr("X_scale"));
+      auto weight_scale0 = BOOST_GET_CONST(
+          std::vector<float>, mul0_op_desc->GetAttr("weight_scale"));
+      auto weight_scale1 = BOOST_GET_CONST(
+          std::vector<float>, mul1_op_desc->GetAttr("weight_scale"));
+      auto weight_scale2 = BOOST_GET_CONST(
+          std::vector<float>, mul2_op_desc->GetAttr("weight_scale"));
+      auto weight_max = std::max(weight_scale0, weight_scale1);
+      weight_max = std::max(weight_max, weight_scale2);
+      multihead_op_desc.SetAttr("weight_scale", weight_max);
+
+      if (mul0_op_desc->HasAttr("out_threshold")) {
+        auto out_scale0 =
+            BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold"));
+        auto out_scale1 =
+            BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold"));
+        auto out_scale2 =
+            BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold"));
+        auto out_scale_max = std::max(out_scale0, out_scale1);
+        out_scale_max = std::max(out_scale_max, out_scale2);
+        multihead_op_desc.SetAttr("out_threshold", out_scale_max);
+      }
+    }
     auto* multihead = graph->CreateOpNode(&multihead_op_desc);
     IR_NODE_LINK_TO(input0, multihead);
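A side note on the weight_scale handling in the hunk above: weight_scale0, weight_scale1, and weight_scale2 are std::vector<float>, so std::max compares them lexicographically rather than element-wise. A standalone snippet (not part of the commit) demonstrating that semantics:

```cpp
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  // Lexicographic comparison: the first differing element decides, so
  // {2.0, 0.1} is "greater" than {1.0, 9.9} as a whole vector.
  std::vector<float> a{2.0f, 0.1f};
  std::vector<float> b{1.0f, 9.9f};
  std::vector<float> m = std::max(a, b);
  std::cout << m[0] << " " << m[1] << std::endl;  // prints "2 0.1"
}
```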
paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
@@ -153,6 +153,10 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
   new_desc.SetInput("Scale", {layer_norm_scale->Name()});
   new_desc.SetInput("Bias", {layer_norm_bias->Name()});
+
+  if (elementwise->Op()->HasAttr("out_threshold")) {
+    new_desc.SetAttr("enable_int8", true);
+  }
   // outputs
   new_desc.SetOutput("Out", {layer_norm_out->Name()});
paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -31,7 +31,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
 #if IS_TRT_VERSION_GE(6000)
-    VLOG(4) << "convert fluid swish op to tensorrt layer";
+    VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer";

     framework::OpDesc op_desc(op, nullptr);
     auto id_names = op_desc.Input("Ids");

@@ -89,10 +89,14 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     int64_t bias_size = framework::product(bias_dims);
     int64_t scale_size = framework::product(scale_dims);
     nvinfer1::ILayer* layer = nullptr;
+    bool enable_int8 = op_desc.HasAttr("enable_int8");

     if (engine_->with_dynamic_shape()) {
       if (engine_->use_oss()) {
         int output_fp16 = static_cast<int>((engine_->WithFp16() == 1) ? 1 : 0);
+        if (enable_int8) {
+          output_fp16 = 1;
+        }
         PADDLE_ENFORCE_EQ(
             output_fp16, 1,
             platform::errors::InvalidArgument(
paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -106,8 +106,22 @@ class FcOpConverter : public OpConverter {
     auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output,
                          TensorRTEngine::Weight& weight,
                          TensorRTEngine::Weight& bias) {
-      auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs,
-                                            n_output, weight.get(), bias.get());
+      nvinfer1::ILayer* fc_layer = nullptr;
+      if (enable_int8) {
+        PADDLE_ENFORCE_EQ(
+            op_desc.HasAttr("out_threshold"), true,
+            platform::errors::InvalidArgument(
+                "must have out threshold in fc layers in int8 mode"));
+        float out_scale =
+            BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+        nvinfer1::DimsHW nv_ksize(1, 1);
+        fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs,
+                                        n_output, nv_ksize, weight.get(),
+                                        bias.get());
+        engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
+      } else {
+        fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs,
+                                        n_output, weight.get(), bias.get());
+      }
       auto output_name = op_desc.Output("Out").front();
       if (activation_type == "relu") {

@@ -229,6 +243,15 @@ class FcOpConverter : public OpConverter {
                 "dims equals to 4, the last dim of input must be 1, but got %d",
                 input_d[3]));
       }
+      if (enable_int8) {
+        reshape_dim3[0] = 1;
+        for (int i = 0; i < 3; i++) {
+          reshape_dim3[0] *= input_d[i];
+          if (i > 0) {
+            reshape_dim3[i] = 1;
+          }
+        }
+      } else {
       for (int i = 0; i < 3; i++) {
         if (i < input_dims) {
           reshape_dim3[i] = input_d[i];

@@ -236,6 +259,8 @@ class FcOpConverter : public OpConverter {
           reshape_dim3[i] = 1;
         }
       }
+      }
       nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1],
                                   reshape_dim3[2]);
       auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);

@@ -249,6 +274,19 @@ class FcOpConverter : public OpConverter {
           platform::errors::InvalidArgument(
               "Invalid dimensions. When x_num_col_dims equals to "
               "2, input_dims should not be 1"));
+      if (enable_int8) {
+        for (int i = 0; i < 4; i++) {
+          if (i == 0) {
+            reshape_dim4[i] = input_d[i];
+          } else {
+            reshape_dim4[i] = 1;
+            if (i < input_dims) {
+              reshape_dim4[1] *= input_d[i];
+            }
+          }
+        }
+      } else {
      for (int i = 0; i < 4; i++) {
        if (i < input_dims) {
          reshape_dim4[i] = input_d[i];

@@ -256,6 +294,7 @@ class FcOpConverter : public OpConverter {
          reshape_dim4[i] = 1;
        }
      }
+      }
      nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1],
                                  reshape_dim4[2], reshape_dim4[3]);
      auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
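To make the int8 reshape in the second hunk concrete, here is a standalone sketch of the same loop with a hypothetical input shape (the values are illustrative, not from the commit). In int8 mode all leading dims are folded into dim 0 and the rest set to 1, apparently so the flattened activations line up with the 1x1 Convolution path above:

```cpp
#include <array>
#include <cstdio>

int main() {
  // Hypothetical input shape {8, 128, 768} standing in for input_d.
  std::array<int, 3> input_d{8, 128, 768};
  std::array<int, 3> reshape_dim3{0, 0, 0};
  reshape_dim3[0] = 1;
  for (int i = 0; i < 3; i++) {
    reshape_dim3[0] *= input_d[i];
    if (i > 0) {
      reshape_dim3[i] = 1;
    }
  }
  // Prints "786432 1 1": every element folded into dim 0.
  std::printf("%d %d %d\n", reshape_dim3[0], reshape_dim3[1], reshape_dim3[2]);
}
```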
paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -40,8 +40,25 @@ class MultiheadMatMulOpConverter : public OpConverter {
         auto* bias_v = scope.FindVar(bias_name);
         auto* bias_t = bias_v->GetMutable<framework::LoDTensor>();

-        float* weight_data =
-            engine_->GetWeightCPUData(weight_name, weight_t, false);
+        float* weight_data = nullptr;
+        bool enable_int8 = op_desc.HasAttr("enable_int8");
+        float in_scale = 0.;
+        if (enable_int8) {
+          PADDLE_ENFORCE_EQ(
+              op_desc.HasAttr("Input_scale"), true,
+              platform::errors::InvalidArgument(
+                  "must have input scale in multihead layers in int8 mode"));
+          in_scale =
+              BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127;
+          auto weight_scale = BOOST_GET_CONST(std::vector<float>,
+                                              op_desc.GetAttr("weight_scale"));
+          weight_data = engine_->GetWeightCPUData(weight_name, weight_t, true,
+                                                  weight_scale);
+          engine_->SetTensorDynamicRange(input, in_scale);
+        } else {
+          weight_data = engine_->GetWeightCPUData(weight_name, weight_t, false);
+        }
+
         float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t, false);
         std::vector<float> weight_data_tmp;
         weight_data_tmp.reserve(weight_t->numel());

@@ -117,8 +134,27 @@ class MultiheadMatMulOpConverter : public OpConverter {
                                          static_cast<void*>(bias_data),
                                          static_cast<int32_t>(bias_t->numel())};
-        auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
-                                              n, weight, bias);
+        nvinfer1::ILayer* fc_layer = nullptr;
+        float dp_probs = 1.0 / 127.0;
+        if (enable_int8) {
+          nvinfer1::DimsHW nv_ksize(1, 1);
+          fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n,
+                                          nv_ksize, weight, bias);
+        } else {
+          fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n,
+                                          weight, bias);
+        }
+
+        if (enable_int8) {
+          PADDLE_ENFORCE_EQ(
+              op_desc.HasAttr("out_threshold"), true,
+              platform::errors::InvalidArgument(
+                  "must have out threshold in multihead layers in int8 mode"));
+          float out_scale =
+              BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+          engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
+          dp_probs = out_scale / 127.0;
+        }
         auto mask_tensor = engine_->GetITensor("qkv_plugin_mask");

@@ -128,6 +164,9 @@ class MultiheadMatMulOpConverter : public OpConverter {
         int type = static_cast<int>((engine_->WithFp16() == 1)
                                         ? nvinfer1::DataType::kHALF
                                         : nvinfer1::DataType::kFLOAT);
+        if (enable_int8) {
+          type = static_cast<int>(nvinfer1::DataType::kHALF);
+        }
         bool has_mask = true;
         int var_seqlen = 1;
         const std::vector<nvinfer1::PluginField> fields{

@@ -136,7 +175,7 @@ class MultiheadMatMulOpConverter : public OpConverter {
             {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1},
             {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1},
             {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1},
-        };
+            {"dq_probs", &dp_probs, nvinfer1::PluginFieldType::kFLOAT32, 1}};
         nvinfer1::PluginFieldCollection* plugin_collection =
             static_cast<nvinfer1::PluginFieldCollection*>(
                 malloc(sizeof(*plugin_collection) +
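The scale arithmetic in the first two hunks appears to follow TensorRT's convention that SetTensorDynamicRange takes the abs-max float value that maps to int8 127. Assuming that reading, a standalone sketch of the two conversions (the numbers are made up):

```cpp
#include <cstdio>

int main() {
  // Suppose the quantization pass recorded Input_scale = 0.05 for the
  // qkv input; the converter hands TensorRT the abs-max float value.
  float input_scale = 0.05f;
  float in_scale = input_scale * 127;       // dynamic range given to TRT: 6.35
  // out_threshold is already an abs-max, so the plugin's dequantization
  // step (dq_probs) is threshold / 127.
  float out_threshold = 6.35f;
  float dp_probs = out_threshold / 127.0f;  // 0.05
  std::printf("in_scale=%.2f dq_probs=%.4f\n", in_scale, dp_probs);
}
```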
paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
@@ -49,6 +49,7 @@ class SkipLayerNormOpConverter : public OpConverter {
     auto* scale = get_persistable_data("Scale", &scale_dims);
     int bias_size = framework::product(bias_dims);
     int scale_size = framework::product(scale_dims);
+    bool enable_int8 = op_desc.HasAttr("enable_int8");
     nvinfer1::ILayer* layer = nullptr;

     if (engine_->with_dynamic_shape()) {

@@ -62,6 +63,10 @@ class SkipLayerNormOpConverter : public OpConverter {
         int ld = input1->getDimensions().d[2];  // hidden dimension
         assert(ld > 0);
+
+        if (enable_int8) {
+          type = static_cast<int>(nvinfer1::DataType::kHALF);
+        }

         const std::vector<nvinfer1::PluginField> fields{
             {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1},
             {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1},
paddle/fluid/inference/tensorrt/convert/slice_op.cc
@@ -31,6 +31,12 @@ class SliceOpConverter : public OpConverter {
     // Declare inputs
     auto* input = engine_->GetITensor(op_desc.Input("Input")[0]);
+    if (op_desc.HasAttr("out_threshold")) {
+      float out_scale =
+          BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+      engine_->SetTensorDynamicRange(input, out_scale);
+    }
+
     std::vector<int> axes =
         BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("axes"));
     std::vector<int> starts =
paddle/fluid/inference/tensorrt/convert/stack_op.cc
@@ -45,6 +45,11 @@ class StackOpConverter : public OpConverter {
   for (int i = 0; i < input_num; ++i) {
     inputs[i] = engine_->GetITensor(input[i]);
+    if (op_desc.HasAttr("out_threshold")) {
+      float out_scale =
+          BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+      engine_->SetTensorDynamicRange(inputs[i], out_scale);
+    }
   }

   int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));
paddle/fluid/inference/tensorrt/op_teller.cc
@@ -45,6 +45,12 @@ struct SimpleOpTypeSetTeller : public Teller {
 #endif
 #if IS_TRT_VERSION_GE(7130)
     teller_set.insert("group_norm");
+    int8_teller_set.insert("multihead_matmul");
+    int8_teller_set.insert("skip_layernorm");
+    int8_teller_set.insert("fused_embedding_eltwise_layernorm");
+    int8_teller_set.insert("matmul");
+    int8_teller_set.insert("stack");
+    int8_teller_set.insert("slice");
 #endif
   }
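A minimal standalone sketch of what such a teller set does: an op type is only offloaded to the TensorRT int8 engine if its type is in the set, otherwise it falls back to the regular execution path. The set contents mirror the six insertions above; the surrounding gating logic in op_teller.cc is elided here.

```cpp
#include <iostream>
#include <string>
#include <unordered_set>

int main() {
  // The six op types newly admitted to int8 conversion by this commit.
  std::unordered_set<std::string> int8_teller_set{
      "multihead_matmul", "skip_layernorm",
      "fused_embedding_eltwise_layernorm", "matmul", "stack", "slice"};
  for (const std::string& op : {"stack", "softmax"}) {
    std::cout << op << ": "
              << (int8_teller_set.count(op) ? "convert" : "fall back") << "\n";
  }
}
```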
python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -60,6 +60,7 @@ _out_scale_op_list = [
     "swish",
     "softmax",
     "batch_norm",
+    "layer_norm",
     "elementwise_add",
     "pool2d",
     "reshape2",

@@ -67,6 +68,7 @@ _out_scale_op_list = [
     "concat",
     "elementwise_mul",
     "scale",
+    "slice",
     "hard_swish",
     "hard_sigmoid",
     "conv2d_transpose",

@@ -119,6 +121,7 @@ _op_real_in_out_name = {
     "swish": [["X"], ["Out"]],
     "dropout": [["X"], ["Out"]],
     "batch_norm": [["X"], ["Y"]],
+    "layer_norm": [["X"], ["Y"]],
     "sigmoid": [["X"], ["Out"]],
     "elementwise_mul": [["X", "Y"], ["Out"]],
     "scale": [["X"], ["Out"]],

@@ -1749,7 +1752,7 @@ class AddQuantDequantPass(object):
             "bilinear_interp", "nearest_interp", "trilinear_interp", "slice",
             "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6",
             "leaky_relu", "tanh", "swish", "scale", "transpose", "transpose2",
-            "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm"
+            "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm"
         ]
         # To be compatible with PaddleSlim, not remove _activation_type for now