Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
6da043eb
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2299
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
6da043eb
编写于
4月 16, 2021
作者:
C
ceci3
提交者:
GitHub
4月 16, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
support ernie trt-int8 for inference (#32232)
* support ernie trt-int8 for inference * fix reshape
上级
fabdb43c
变更
11
显示空白变更内容
内联
并排
Showing
11 changed file
with
167 addition
and
19 deletion
+167
-19
paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
...uid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+5
-0
paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+32
-0
paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
+4
-0
paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
...fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+5
-1
paddle/fluid/inference/tensorrt/convert/fc_op.cc
paddle/fluid/inference/tensorrt/convert/fc_op.cc
+51
-12
paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
...e/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+44
-5
paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
+5
-0
paddle/fluid/inference/tensorrt/convert/slice_op.cc
paddle/fluid/inference/tensorrt/convert/slice_op.cc
+6
-0
paddle/fluid/inference/tensorrt/convert/stack_op.cc
paddle/fluid/inference/tensorrt/convert/stack_op.cc
+5
-0
paddle/fluid/inference/tensorrt/op_teller.cc
paddle/fluid/inference/tensorrt/op_teller.cc
+6
-0
python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
...ddle/fluid/contrib/slim/quantization/quantization_pass.py
+4
-1
未找到文件。
paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
浏览文件 @
6da043eb
...
...
@@ -299,6 +299,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope
new_op_desc
.
SetOutput
(
"Out"
,
{
end_pattern_out
[
k
]
->
Name
()});
new_op_desc
.
SetAttr
(
"epsilon"
,
end_patter_layernorms
[
k
]
->
Op
()
->
GetAttr
(
"epsilon"
));
if
(
end_patter_layernorms
[
k
]
->
Op
()
->
HasAttr
(
"out_threshold"
))
{
new_op_desc
.
SetAttr
(
"enable_int8"
,
true
);
}
auto
*
embedding_eltwise_layernorm
=
graph
->
CreateOpNode
(
&
new_op_desc
);
for
(
size_t
iter
=
0
;
iter
<
start_pattern_in_nodes
[
i
].
size
();
++
iter
)
{
...
...
paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
浏览文件 @
6da043eb
...
...
@@ -535,6 +535,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
multihead_op_desc
.
SetAttr
(
"alpha"
,
scale_attr
);
multihead_op_desc
.
SetAttr
(
"head_number"
,
head_number
);
auto
*
mul0_op_desc
=
mul0
->
Op
();
auto
*
mul1_op_desc
=
mul1
->
Op
();
auto
*
mul2_op_desc
=
mul2
->
Op
();
if
(
mul0_op_desc
->
HasAttr
(
"enable_int8"
))
{
multihead_op_desc
.
SetAttr
(
"enable_int8"
,
mul0_op_desc
->
GetAttr
(
"enable_int8"
));
// all mul op has same input.
multihead_op_desc
.
SetAttr
(
"Input_scale"
,
mul0_op_desc
->
GetAttr
(
"X_scale"
));
auto
weight_scale0
=
BOOST_GET_CONST
(
std
::
vector
<
float
>
,
mul0_op_desc
->
GetAttr
(
"weight_scale"
));
auto
weight_scale1
=
BOOST_GET_CONST
(
std
::
vector
<
float
>
,
mul1_op_desc
->
GetAttr
(
"weight_scale"
));
auto
weight_scale2
=
BOOST_GET_CONST
(
std
::
vector
<
float
>
,
mul2_op_desc
->
GetAttr
(
"weight_scale"
));
auto
weight_max
=
std
::
max
(
weight_scale0
,
weight_scale1
);
weight_max
=
std
::
max
(
weight_max
,
weight_scale2
);
multihead_op_desc
.
SetAttr
(
"weight_scale"
,
weight_max
);
if
(
mul0_op_desc
->
HasAttr
(
"out_threshold"
))
{
auto
out_scale0
=
BOOST_GET_CONST
(
float
,
mul0_op_desc
->
GetAttr
(
"out_threshold"
));
auto
out_scale1
=
BOOST_GET_CONST
(
float
,
mul1_op_desc
->
GetAttr
(
"out_threshold"
));
auto
out_scale2
=
BOOST_GET_CONST
(
float
,
mul2_op_desc
->
GetAttr
(
"out_threshold"
));
auto
out_scale_max
=
std
::
max
(
out_scale0
,
out_scale1
);
out_scale_max
=
std
::
max
(
out_scale_max
,
out_scale2
);
multihead_op_desc
.
SetAttr
(
"out_threshold"
,
out_scale_max
);
}
}
auto
*
multihead
=
graph
->
CreateOpNode
(
&
multihead_op_desc
);
IR_NODE_LINK_TO
(
input0
,
multihead
);
...
...
paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
浏览文件 @
6da043eb
...
...
@@ -153,6 +153,10 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
new_desc
.
SetInput
(
"Scale"
,
{
layer_norm_scale
->
Name
()});
new_desc
.
SetInput
(
"Bias"
,
{
layer_norm_bias
->
Name
()});
if
(
elementwise
->
Op
()
->
HasAttr
(
"out_threshold"
))
{
new_desc
.
SetAttr
(
"enable_int8"
,
true
);
}
// outputs
new_desc
.
SetOutput
(
"Out"
,
{
layer_norm_out
->
Name
()});
...
...
paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
浏览文件 @
6da043eb
...
...
@@ -31,7 +31,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
#if IS_TRT_VERSION_GE(6000)
VLOG
(
4
)
<<
"convert fluid
swish
op to tensorrt layer"
;
VLOG
(
4
)
<<
"convert fluid
EmbEltwiseLayerNorm
op to tensorrt layer"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
auto
id_names
=
op_desc
.
Input
(
"Ids"
);
...
...
@@ -89,10 +89,14 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
int64_t
bias_size
=
framework
::
product
(
bias_dims
);
int64_t
scale_size
=
framework
::
product
(
scale_dims
);
nvinfer1
::
ILayer
*
layer
=
nullptr
;
bool
enable_int8
=
op_desc
.
HasAttr
(
"enable_int8"
);
if
(
engine_
->
with_dynamic_shape
())
{
if
(
engine_
->
use_oss
())
{
int
output_fp16
=
static_cast
<
int
>
((
engine_
->
WithFp16
()
==
1
)
?
1
:
0
);
if
(
enable_int8
)
{
output_fp16
=
1
;
}
PADDLE_ENFORCE_EQ
(
output_fp16
,
1
,
platform
::
errors
::
InvalidArgument
(
...
...
paddle/fluid/inference/tensorrt/convert/fc_op.cc
浏览文件 @
6da043eb
...
...
@@ -106,8 +106,22 @@ class FcOpConverter : public OpConverter {
auto
regist_fc
=
[
&
](
nvinfer1
::
ITensor
*
inputs
,
int
n_output
,
TensorRTEngine
::
Weight
&
weight
,
TensorRTEngine
::
Weight
&
bias
)
{
auto
*
fc_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
FullyConnected
,
*
inputs
,
nvinfer1
::
ILayer
*
fc_layer
=
nullptr
;
if
(
enable_int8
)
{
PADDLE_ENFORCE_EQ
(
op_desc
.
HasAttr
(
"out_threshold"
),
true
,
platform
::
errors
::
InvalidArgument
(
"must have out threshold in fc layers in int8 mode"
));
float
out_scale
=
BOOST_GET_CONST
(
float
,
op_desc
.
GetAttr
(
"out_threshold"
));
nvinfer1
::
DimsHW
nv_ksize
(
1
,
1
);
fc_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Convolution
,
*
inputs
,
n_output
,
nv_ksize
,
weight
.
get
(),
bias
.
get
());
engine_
->
SetTensorDynamicRange
(
fc_layer
->
getOutput
(
0
),
out_scale
);
}
else
{
fc_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
FullyConnected
,
*
inputs
,
n_output
,
weight
.
get
(),
bias
.
get
());
}
auto
output_name
=
op_desc
.
Output
(
"Out"
).
front
();
if
(
activation_type
==
"relu"
)
{
...
...
@@ -229,6 +243,15 @@ class FcOpConverter : public OpConverter {
"dims equals to 4, the last dim of input must be 1, but got %d"
,
input_d
[
3
]));
}
if
(
enable_int8
)
{
reshape_dim3
[
0
]
=
1
;
for
(
int
i
=
0
;
i
<
3
;
i
++
)
{
reshape_dim3
[
0
]
*=
input_d
[
i
];
if
(
i
>
0
)
{
reshape_dim3
[
i
]
=
1
;
}
}
}
else
{
for
(
int
i
=
0
;
i
<
3
;
i
++
)
{
if
(
i
<
input_dims
)
{
reshape_dim3
[
i
]
=
input_d
[
i
];
...
...
@@ -236,6 +259,8 @@ class FcOpConverter : public OpConverter {
reshape_dim3
[
i
]
=
1
;
}
}
}
nvinfer1
::
Dims3
reshape_dim
(
reshape_dim3
[
0
],
reshape_dim3
[
1
],
reshape_dim3
[
2
]);
auto
*
reshape_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Shuffle
,
*
X
);
...
...
@@ -249,6 +274,19 @@ class FcOpConverter : public OpConverter {
platform
::
errors
::
InvalidArgument
(
"Invalid dimensions. When x_num_col_dims equals to "
"2, input_dims should not be 1"
));
if
(
enable_int8
)
{
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
if
(
i
==
0
)
{
reshape_dim4
[
i
]
=
input_d
[
i
];
}
else
{
reshape_dim4
[
i
]
=
1
;
if
(
i
<
input_dims
)
{
reshape_dim4
[
1
]
*=
input_d
[
i
];
}
}
}
}
else
{
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
if
(
i
<
input_dims
)
{
reshape_dim4
[
i
]
=
input_d
[
i
];
...
...
@@ -256,6 +294,7 @@ class FcOpConverter : public OpConverter {
reshape_dim4
[
i
]
=
1
;
}
}
}
nvinfer1
::
Dims4
reshape_dim
(
reshape_dim4
[
0
],
reshape_dim4
[
1
],
reshape_dim4
[
2
],
reshape_dim4
[
3
]);
auto
*
reshape_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Shuffle
,
*
X
);
...
...
paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
浏览文件 @
6da043eb
...
...
@@ -40,8 +40,25 @@ class MultiheadMatMulOpConverter : public OpConverter {
auto
*
bias_v
=
scope
.
FindVar
(
bias_name
);
auto
*
bias_t
=
bias_v
->
GetMutable
<
framework
::
LoDTensor
>
();
float
*
weight_data
=
engine_
->
GetWeightCPUData
(
weight_name
,
weight_t
,
false
);
float
*
weight_data
=
nullptr
;
bool
enable_int8
=
op_desc
.
HasAttr
(
"enable_int8"
);
float
in_scale
=
0.
;
if
(
enable_int8
)
{
PADDLE_ENFORCE_EQ
(
op_desc
.
HasAttr
(
"Input_scale"
),
true
,
platform
::
errors
::
InvalidArgument
(
"must have input scale in multihead layers in int8 mode"
));
in_scale
=
BOOST_GET_CONST
(
float
,
op_desc
.
GetAttr
(
"Input_scale"
))
*
127
;
auto
weight_scale
=
BOOST_GET_CONST
(
std
::
vector
<
float
>
,
op_desc
.
GetAttr
(
"weight_scale"
));
weight_data
=
engine_
->
GetWeightCPUData
(
weight_name
,
weight_t
,
true
,
weight_scale
);
engine_
->
SetTensorDynamicRange
(
input
,
in_scale
);
}
else
{
weight_data
=
engine_
->
GetWeightCPUData
(
weight_name
,
weight_t
,
false
);
}
float
*
bias_data
=
engine_
->
GetWeightCPUData
(
bias_name
,
bias_t
,
false
);
std
::
vector
<
float
>
weight_data_tmp
;
weight_data_tmp
.
reserve
(
weight_t
->
numel
());
...
...
@@ -117,8 +134,27 @@ class MultiheadMatMulOpConverter : public OpConverter {
static_cast
<
void
*>
(
bias_data
),
static_cast
<
int32_t
>
(
bias_t
->
numel
())};
auto
*
fc_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
FullyConnected
,
*
input
,
n
,
weight
,
bias
);
nvinfer1
::
ILayer
*
fc_layer
=
nullptr
;
float
dp_probs
=
1.0
/
127.0
;
if
(
enable_int8
)
{
nvinfer1
::
DimsHW
nv_ksize
(
1
,
1
);
fc_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Convolution
,
*
input
,
n
,
nv_ksize
,
weight
,
bias
);
}
else
{
fc_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
FullyConnected
,
*
input
,
n
,
weight
,
bias
);
}
if
(
enable_int8
)
{
PADDLE_ENFORCE_EQ
(
op_desc
.
HasAttr
(
"out_threshold"
),
true
,
platform
::
errors
::
InvalidArgument
(
"must have out threshold in multihead layers in int8 mode"
));
float
out_scale
=
BOOST_GET_CONST
(
float
,
op_desc
.
GetAttr
(
"out_threshold"
));
engine_
->
SetTensorDynamicRange
(
fc_layer
->
getOutput
(
0
),
out_scale
);
dp_probs
=
out_scale
/
127.0
;
}
auto
mask_tensor
=
engine_
->
GetITensor
(
"qkv_plugin_mask"
);
...
...
@@ -128,6 +164,9 @@ class MultiheadMatMulOpConverter : public OpConverter {
int
type
=
static_cast
<
int
>
((
engine_
->
WithFp16
()
==
1
)
?
nvinfer1
::
DataType
::
kHALF
:
nvinfer1
::
DataType
::
kFLOAT
);
if
(
enable_int8
)
{
type
=
static_cast
<
int
>
(
nvinfer1
::
DataType
::
kHALF
);
}
bool
has_mask
=
true
;
int
var_seqlen
=
1
;
const
std
::
vector
<
nvinfer1
::
PluginField
>
fields
{
...
...
@@ -136,7 +175,7 @@ class MultiheadMatMulOpConverter : public OpConverter {
{
"num_heads"
,
&
head_number
,
nvinfer1
::
PluginFieldType
::
kINT32
,
1
},
{
"has_mask"
,
&
has_mask
,
nvinfer1
::
PluginFieldType
::
kINT32
,
1
},
{
"var_seqlen"
,
&
var_seqlen
,
nvinfer1
::
PluginFieldType
::
kINT32
,
1
},
};
{
"dq_probs"
,
&
dp_probs
,
nvinfer1
::
PluginFieldType
::
kFLOAT32
,
1
}
};
nvinfer1
::
PluginFieldCollection
*
plugin_collection
=
static_cast
<
nvinfer1
::
PluginFieldCollection
*>
(
malloc
(
sizeof
(
*
plugin_collection
)
+
...
...
paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
浏览文件 @
6da043eb
...
...
@@ -49,6 +49,7 @@ class SkipLayerNormOpConverter : public OpConverter {
auto
*
scale
=
get_persistable_data
(
"Scale"
,
&
scale_dims
);
int
bias_size
=
framework
::
product
(
bias_dims
);
int
scale_size
=
framework
::
product
(
scale_dims
);
bool
enable_int8
=
op_desc
.
HasAttr
(
"enable_int8"
);
nvinfer1
::
ILayer
*
layer
=
nullptr
;
if
(
engine_
->
with_dynamic_shape
())
{
...
...
@@ -62,6 +63,10 @@ class SkipLayerNormOpConverter : public OpConverter {
int
ld
=
input1
->
getDimensions
().
d
[
2
];
// hidden dimension
assert
(
ld
>
0
);
if
(
enable_int8
)
{
type
=
static_cast
<
int
>
(
nvinfer1
::
DataType
::
kHALF
);
}
const
std
::
vector
<
nvinfer1
::
PluginField
>
fields
{
{
"type_id"
,
&
type
,
nvinfer1
::
PluginFieldType
::
kINT32
,
1
},
{
"ld"
,
&
ld
,
nvinfer1
::
PluginFieldType
::
kINT32
,
1
},
...
...
paddle/fluid/inference/tensorrt/convert/slice_op.cc
浏览文件 @
6da043eb
...
...
@@ -31,6 +31,12 @@ class SliceOpConverter : public OpConverter {
// Declare inputs
auto
*
input
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"Input"
)[
0
]);
if
(
op_desc
.
HasAttr
(
"out_threshold"
))
{
float
out_scale
=
BOOST_GET_CONST
(
float
,
op_desc
.
GetAttr
(
"out_threshold"
));
engine_
->
SetTensorDynamicRange
(
input
,
out_scale
);
}
std
::
vector
<
int
>
axes
=
BOOST_GET_CONST
(
std
::
vector
<
int
>
,
op_desc
.
GetAttr
(
"axes"
));
std
::
vector
<
int
>
starts
=
...
...
paddle/fluid/inference/tensorrt/convert/stack_op.cc
浏览文件 @
6da043eb
...
...
@@ -45,6 +45,11 @@ class StackOpConverter : public OpConverter {
for
(
int
i
=
0
;
i
<
input_num
;
++
i
)
{
inputs
[
i
]
=
engine_
->
GetITensor
(
input
[
i
]);
if
(
op_desc
.
HasAttr
(
"out_threshold"
))
{
float
out_scale
=
BOOST_GET_CONST
(
float
,
op_desc
.
GetAttr
(
"out_threshold"
));
engine_
->
SetTensorDynamicRange
(
inputs
[
i
],
out_scale
);
}
}
int
axis
=
BOOST_GET_CONST
(
int
,
op_desc
.
GetAttr
(
"axis"
));
...
...
paddle/fluid/inference/tensorrt/op_teller.cc
浏览文件 @
6da043eb
...
...
@@ -45,6 +45,12 @@ struct SimpleOpTypeSetTeller : public Teller {
#endif
#if IS_TRT_VERSION_GE(7130)
teller_set
.
insert
(
"group_norm"
);
int8_teller_set
.
insert
(
"multihead_matmul"
);
int8_teller_set
.
insert
(
"skip_layernorm"
);
int8_teller_set
.
insert
(
"fused_embedding_eltwise_layernorm"
);
int8_teller_set
.
insert
(
"matmul"
);
int8_teller_set
.
insert
(
"stack"
);
int8_teller_set
.
insert
(
"slice"
);
#endif
}
...
...
python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
浏览文件 @
6da043eb
...
...
@@ -60,6 +60,7 @@ _out_scale_op_list = [
"swish"
,
"softmax"
,
"batch_norm"
,
"layer_norm"
,
"elementwise_add"
,
"pool2d"
,
"reshape2"
,
...
...
@@ -67,6 +68,7 @@ _out_scale_op_list = [
"concat"
,
"elementwise_mul"
,
"scale"
,
"slice"
,
"hard_swish"
,
"hard_sigmoid"
,
"conv2d_transpose"
,
...
...
@@ -119,6 +121,7 @@ _op_real_in_out_name = {
"swish"
:
[[
"X"
],
[
"Out"
]],
"dropout"
:
[[
"X"
],
[
"Out"
]],
"batch_norm"
:
[[
"X"
],
[
"Y"
]],
"layer_norm"
:
[[
"X"
],
[
"Y"
]],
"sigmoid"
:
[[
"X"
],
[
"Out"
]],
"elementwise_mul"
:
[[
"X"
,
"Y"
],
[
"Out"
]],
"scale"
:
[[
"X"
],
[
"Out"
]],
...
...
@@ -1749,7 +1752,7 @@ class AddQuantDequantPass(object):
"bilinear_interp"
,
"nearest_interp"
,
"trilinear_interp"
,
"slice"
,
"squeeze"
,
"elementwise_sub"
,
"mul"
,
"matmul"
,
"relu"
,
"relu6"
,
"leaky_relu"
,
"tanh"
,
"swish"
,
"scale"
,
"transpose"
,
"transpose2"
,
"sigmoid"
,
"pad2d"
,
"flatten"
,
"flatten2"
,
"batch_norm"
"sigmoid"
,
"pad2d"
,
"flatten"
,
"flatten2"
,
"batch_norm"
,
"layer_norm"
]
# To be compatible with PaddleSlim, not remove _activation_type for now
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录