BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Unverified commit fa06d9c3
Authored Aug 26, 2022 by Wangzheee; committed by GitHub on Aug 26, 2022
fix_multihead (#45429)
Parent: a5e9ccda

Showing 1 changed file with 319 additions and 310 deletions.

paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc  +319  -310
...
@@ -291,333 +291,342 @@ class MultiheadMatMulOpConverter : public OpConverter {
            plugin_inputs.data(), plugin_inputs.size(), *plugin);
        layer = plugin_layer;
      }
    } else {
      if (input_dims.d[1] <= 384 && !bias_qk_attr &&
          engine_->precision() != AnalysisConfig::Precision::kFloat32) {
        /*
         * input_dims.d[0]: batch(-1)
         * input_dims.d[1]: length:256
         * input_dims.d[2]: hidden_size:768
             input
               |[b,256,768]
               |
             shuffle              weight    bias
               |[b,256,768,1,1]      |        |
               |____________________|________|
               |
               fc
               |[b,256,2304,1,1]
               |
             shuffle    mask(fake)   pos    max_length
               |[b*256,2304,1,1] |     |        |
               |                 |     |        |
               |_________________|_____|________|
               |
               MHA
               |[b*256,768]
               |
             shuffle
               |[b, 256, 768]
               |
               out
        */
        nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT,
                                 static_cast<void*>(weight_data),
                                 static_cast<int32_t>(weight_t->numel())};
        nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
                               static_cast<void*>(bias_data),
                               static_cast<int32_t>(bias_t->numel())};
        /*** transpose the weight and bias ***/
        int head_size = hidden_out / head_number;
        // [3, head_number, head_size, hidden_in] -> [head_number, 3,
        // head_size, hidden_in]
        auto transpose_weight_v2 = [](const float* src,
                                      float* dst,
                                      int three,
                                      int head_number,
                                      int head_size,
                                      int hidden_in) {
          const int HH = head_size * hidden_in;
          for (int i = 0; i < three; ++i) {
            for (int n = 0; n < head_number; ++n) {
              for (int hh = 0; hh < HH; ++hh) {
                dst[n * three * HH + i * HH + hh] =
                    src[i * head_number * HH + n * HH + hh];
              }
            }
          }
        };
        // [3, head_number, head_size] -> [head_number, 3, head_size]
        auto transpose_bias_v2 =
            [](const float* src, float* dst, int N, int H) {
              for (int i = 0; i < 3; ++i) {
                for (int n = 0; n < N; ++n) {
                  for (int h = 0; h < H; ++h) {
                    dst[n * 3 * H + i * H + h] =
                        src[i * N * H + n * H + h];
                  }
                }
              }
            };
        memcpy(weight_data_tmp.data(),
               weight_data,
               weight_t->numel() * sizeof(float));
        transpose_weight_v2(weight_data_tmp.data(),
                            weight_data,
                            three,
                            head_number,
                            head_size,
                            hidden_in);
        std::vector<float> bias_data_tmp;
        bias_data_tmp.reserve(bias_t->numel());
        memcpy(
            bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float));
        transpose_bias_v2(
            bias_data_tmp.data(), bias_data, head_number, head_size);
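[Note] The weight relayout above is the crux of this branch: the fused MHA plugin expects the QKV projection weights interleaved per head, [head_number, 3, head_size, hidden_in], rather than Paddle's [3, head_number, head_size, hidden_in]. A minimal self-contained sketch of the same permutation with a toy check follows; the sizes and the TransposeQkvWeight name are illustrative, not part of the patch.

#include <cassert>
#include <vector>

// Same index permutation as transpose_weight_v2 above:
// src is [3, N, HH], dst is [N, 3, HH], where HH == head_size * hidden_in.
void TransposeQkvWeight(const float* src, float* dst, int N, int HH) {
  for (int i = 0; i < 3; ++i)
    for (int n = 0; n < N; ++n)
      for (int hh = 0; hh < HH; ++hh)
        dst[n * 3 * HH + i * HH + hh] = src[i * N * HH + n * HH + hh];
}

int main() {
  const int N = 2, HH = 3;  // toy head count and head_size * hidden_in
  std::vector<float> src(3 * N * HH), dst(src.size());
  for (size_t k = 0; k < src.size(); ++k) src[k] = static_cast<float>(k);
  TransposeQkvWeight(src.data(), dst.data(), N, HH);
  // element (qkv=1, head=1, offset=2) moves from src[1*N*HH + 1*HH + 2]
  // to dst[1*3*HH + 1*HH + 2].
  assert(dst[1 * 3 * HH + 1 * HH + 2] == src[1 * N * HH + 1 * HH + 2]);
  return 0;
}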
        // add shuffle for FullyConnected layer
        std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
        nvinfer1::ITensor* input_shape_tensor = Shape(input);
        for (int i = 0; i < 5; i++) {
          reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
        }
        for (int i = 0; i < 3; i++) {
          reshape_before_fc_shape_tensor[i] =
              GetEleTensorOfShape(input_shape_tensor, i);
        }
        auto* reshape_before_fc_layer =
            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
        reshape_before_fc_layer->setInput(
            1, *Concat(reshape_before_fc_shape_tensor));
        reshape_before_fc_layer->setName(
            ("shuffle_before_fc_multihead_matmul(Output: " + output_name +
             ")")
                .c_str());
        // add fc layer
        nvinfer1::ILayer* fc_layer = nullptr;
        fc_layer = TRT_ENGINE_ADD_LAYER(engine_,
                                        FullyConnected,
                                        *reshape_before_fc_layer->getOutput(0),
                                        n,
                                        weight,
                                        bias);
        // add shuffle for CustomQKVToContextPluginDynamic layer
        auto* reshape_after_fc_layer =
            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
        std::vector<nvinfer1::ITensor*> mha_input_tensor_shape;
        mha_input_tensor_shape.push_back(Add1DConstantLayer(-1));
        mha_input_tensor_shape.push_back(
            Add1DConstantLayer(hidden_out * 3));  // Q,K,V
        mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
        mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
        reshape_after_fc_layer->setInput(1, *Concat(mha_input_tensor_shape));
        reshape_after_fc_layer->setName(
            ("shuffle_after_fc_multihead_matmul(Output: " + output_name + ")")
                .c_str());
        // add mha_plugin
        auto creator = GetPluginRegistry()->getPluginCreator(
            "CustomQKVToContextPluginDynamic", "2");
        assert(creator != nullptr);
        // set the attributes of mha_plugin
        int type = static_cast<int>(nvinfer1::DataType::kHALF);
        int var_seqlen = 1;
        bool has_mask = true;
        std::vector<nvinfer1::PluginField> fields{
            {"hidden_size", &hidden_out, nvinfer1::PluginFieldType::kINT32, 1},
            {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1},
            {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1},
            {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1},
            {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}};
        nvinfer1::PluginFieldCollection* plugin_collection =
            static_cast<nvinfer1::PluginFieldCollection*>(
                malloc(sizeof(*plugin_collection) +
                       fields.size() *
                           sizeof(nvinfer1::PluginField)));  // remember to free
        plugin_collection->nbFields = static_cast<int>(fields.size());
        plugin_collection->fields = fields.data();
        auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic",
                                            plugin_collection);
        free(plugin_collection);
        // set inputs
        std::vector<nvinfer1::ITensor*> plugin_inputs;
        // input_0 for plugin
        plugin_inputs.emplace_back(reshape_after_fc_layer->getOutput(0));
        // input_1(fake) for plugin
        std::vector<int> mask = {1};
        nvinfer1::ITensor* mask_tensor = Add1DConstantLayer(mask);
        plugin_inputs.emplace_back(mask_tensor);
        // input_2 for plugin
        std::vector<int> pos_id = {0};
        int max_batch = 500;
        for (int i = 1; i < max_batch; i++) {
          pos_id.push_back(i);
        }
        nvinfer1::ITensor* fake_pos_id_tensor = Add1DConstantLayer(pos_id);
        nvinfer1::ITensor* length_tensor =
            GetEleTensorOfShape(input_shape_tensor, 1);
        auto pos_id_layer =
            TRT_ENGINE_ADD_LAYER(engine_,
                                 ElementWise,
                                 *fake_pos_id_tensor,
                                 *length_tensor,
                                 nvinfer1::ElementWiseOperation::kPROD);
        // size = batch + 1;
        nvinfer1::ITensor* batch_tensor =
            GetEleTensorOfShape(input_shape_tensor, 0);
        std::vector<int> const_data = {1};
        nvinfer1::ITensor* const_tensor = Add1DConstantLayer(const_data);
        auto size_layer =
            TRT_ENGINE_ADD_LAYER(engine_,
                                 ElementWise,
                                 *batch_tensor,
                                 *const_tensor,
                                 nvinfer1::ElementWiseOperation::kSUM);
        // get size(batch + 1) data from pos_id_tensor
        nvinfer1::Dims start;
        nvinfer1::Dims stride;
        nvinfer1::Dims size;
        start.nbDims = 1;
        stride.nbDims = 1;
        size.nbDims = 1;
        start.d[0] = 0;
        stride.d[0] = 1;
        size.d[0] = 1;
        auto* slice_pos_layer = TRT_ENGINE_ADD_LAYER(
            engine_, Slice, *pos_id_layer->getOutput(0), start, size, stride);
        slice_pos_layer->setInput(2, *size_layer->getOutput(0));
        plugin_inputs.emplace_back(slice_pos_layer->getOutput(0));
        // input_3 for plugin
        std::vector<int> data(500, 1);
        nvinfer1::ITensor* fake_max_seqlen_tensor = Add1DConstantLayer(data);
        auto* slice_max_layer = TRT_ENGINE_ADD_LAYER(
            engine_, Slice, *fake_max_seqlen_tensor, start, size, stride);
        slice_max_layer->setInput(2, *length_tensor);
        plugin_inputs.emplace_back(slice_max_layer->getOutput(0));
        // plugin_layer
        auto plugin_layer = engine_->network()->addPluginV2(
            plugin_inputs.data(), plugin_inputs.size(), *plugin);
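[Note] The pos_id input built above mimics the cumulative-sequence-offsets tensor that version "2" of the varseqlen QKV plugin consumes. Since this path keeps every sequence padded to the same length, offset i is simply i * seq_len: the code scales a constant [0, 1, ..., 499] ramp by the runtime length (kPROD) and slices off the first batch + 1 entries. A host-side sketch of the same arithmetic follows; the batch and seq_len values and the MakePosId name are illustrative.

#include <cstdio>
#include <vector>

// Host-side equivalent of the kPROD + Slice construction above: cumulative
// offsets for `batch` equal-length sequences, plus the terminating offset.
std::vector<int> MakePosId(int batch, int seq_len) {
  std::vector<int> pos_id(batch + 1);
  for (int i = 0; i <= batch; ++i) pos_id[i] = i * seq_len;
  return pos_id;
}

int main() {
  for (int v : MakePosId(/*batch=*/4, /*seq_len=*/256)) printf("%d ", v);
  printf("\n");  // prints: 0 256 512 768 1024
  return 0;
}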
        // add shuffle
        auto* reshape_after_mha_layer = TRT_ENGINE_ADD_LAYER(
            engine_, Shuffle, *plugin_layer->getOutput(0));
        std::vector<nvinfer1::ITensor*> reshape_tensor;
        reshape_tensor.push_back(batch_tensor);
        reshape_tensor.push_back(length_tensor);
        reshape_tensor.push_back(Add1DConstantLayer(-1));
        reshape_after_mha_layer->setInput(1, *Concat(reshape_tensor));
        reshape_after_mha_layer->setName(
            ("shuffle_last_multihead_matmul(Output: " + output_name + ")")
                .c_str());
        // return
        layer = reshape_after_mha_layer;
      } else {
        PADDLE_ENFORCE_EQ(
            input->getDimensions().nbDims,
            3,
            platform::errors::InvalidArgument(
                "The Input dim of the MultiheadMatMul should be 3, "
                "but it's (%d) now.",
                input->getDimensions().nbDims));
        // transpose weight_data from m * n to n * m
        auto* input_bias_qk =
            engine_->GetITensor(op_desc.Input("BiasQK").front());
        TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                      static_cast<void*>(weight_data),
                                      static_cast<size_t>(weight_t->numel())};
        weight.dims.assign({n, m});
        TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
                                    static_cast<void*>(bias_data),
                                    static_cast<size_t>(bias_t->numel())};
        // add shuffle before fc
        std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
        nvinfer1::ITensor* input_shape_tensor = Shape(input);
        for (int i = 0; i < 5; i++) {
          reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
        }
        for (int i = 0; i < 3; i++) {
          reshape_before_fc_shape_tensor[i] =
              GetEleTensorOfShape(input_shape_tensor, i);
        }
        auto* reshape_before_fc_layer =
            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
        if (op_desc.HasAttr("Input_scale")) {
          engine_->SetTensorDynamicRange(
              reshape_before_fc_layer->getOutput(0), in_scale);
        }
        reshape_before_fc_layer->setInput(
            1, *Concat(reshape_before_fc_shape_tensor));
        reshape_before_fc_layer->setName(
            ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
                .c_str());
        // add layer fc
        nvinfer1::ILayer* fc_layer = nullptr;
        if (op_desc.HasAttr("Input_scale")) {
          nvinfer1::DimsHW nv_ksize(1, 1);
          fc_layer =
              TRT_ENGINE_ADD_LAYER(engine_,
                                   Convolution,
                                   *reshape_before_fc_layer->getOutput(0),
                                   n,
                                   nv_ksize,
                                   weight.get(),
                                   bias.get());
        } else {
          fc_layer =
              TRT_ENGINE_ADD_LAYER(engine_,
                                   FullyConnected,
                                   *reshape_before_fc_layer->getOutput(0),
                                   n,
                                   weight.get(),
                                   bias.get());
        }
        if (op_desc.HasAttr("fc_out_threshold")) {
          PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"),
                            true,
                            platform::errors::InvalidArgument(
                                "must have out threshold in multihead layers "
                                "in int8 mode"));
          float out_scale =
              PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
          engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
        }
        fc_layer->setName(
            ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
        // no need to add shuffle after fc, just change it in
        // QkvToContextPluginDynamic
        // add qkv to context
        int head_size = hidden_out / head_number;
        float scale = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
        std::vector<nvinfer1::ITensor*> plugin_inputs;
        plugin_inputs.push_back(fc_layer->getOutput(0));
        plugin_inputs.push_back(input_bias_qk);
        bool with_fp16 =
            engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
        if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
          with_fp16 = true;
        }
        plugin::DynamicPluginTensorRT* plugin =
            new plugin::QkvToContextPluginDynamic(
                hidden_in, head_number, head_size, scale, with_fp16);
        layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin);
      }
    }
  } else {
    PADDLE_THROW(platform::errors::Fatal(
...
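[Note] For context on when the newly added branch fires: it requires a dynamic-shape TensorRT subgraph, a sequence length of at most 384, no explicit BiasQK input, and an engine precision other than kFloat32. A hedged sketch of a Paddle Inference configuration that would steer a BERT-style model onto this path; the model paths, tensor names, shape ranges, and include path are placeholders, not taken from the patch:

#include "paddle_inference_api.h"  // include path may differ per install

#include <map>
#include <string>
#include <vector>

int main() {
  paddle_infer::Config config;
  config.SetModel("bert.pdmodel", "bert.pdiparams");  // placeholder paths
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/1000, /*device_id=*/0);
  // Use kHalf (or kInt8): the fused-MHA branch is skipped for kFloat32.
  config.EnableTensorRtEngine(/*workspace_size=*/1 << 30,
                              /*max_batch_size=*/1,
                              /*min_subgraph_size=*/3,
                              paddle_infer::PrecisionType::kHalf,
                              /*use_static=*/false,
                              /*use_calib_mode=*/false);
  // Dynamic shapes with seq_len <= 384; "input_ids" is a placeholder name.
  std::map<std::string, std::vector<int>> min_shape{{"input_ids", {1, 1}}};
  std::map<std::string, std::vector<int>> max_shape{{"input_ids", {8, 384}}};
  std::map<std::string, std::vector<int>> opt_shape{{"input_ids", {4, 256}}};
  config.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);
  auto predictor = paddle_infer::CreatePredictor(config);
  return 0;
}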