Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
6512e087
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
6512e087
编写于
10月 10, 2022
作者:
W
Wangzheee
提交者:
GitHub
10月 10, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Paddle Inference]fix embedding fused (#46789)
* fix embedding fused
上级
ae6b4713
变更
6
显示空白变更内容
内联
并排
Showing
6 changed file
with
1203 addition
and
275 deletion
+1203
-275
paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
...fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+3
-3
paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
...inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
+1
-1
paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernelHFace.cu
...nsorrt/plugin/many_emb_Layernorm_varseqlen_kernelHFace.cu
+403
-61
paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernelMTron.cu
...nsorrt/plugin/many_emb_Layernorm_varseqlen_kernelMTron.cu
+419
-71
paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.cu
...ce/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.cu
+263
-115
paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h
...nce/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h
+114
-24
未找到文件。
paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
浏览文件 @
6512e087
...
...
@@ -210,14 +210,14 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
"max_seqlen_tensor"
));
// max_seqlen, eval_placeholder_3
auto
creator
=
GetPluginRegistry
()
->
getPluginCreator
(
"ManyEmbLayerNormPluginDynamic"
,
"
2
"
);
"ManyEmbLayerNormPluginDynamic"
,
"
1
"
);
auto
plugin_obj
=
creator
->
createPlugin
(
"ManyEmbLayerNormPluginDynamic"
,
plugin_ptr
);
auto
plugin_layer
=
engine_
->
network
()
->
addPluginV2
(
plugin_inputs
.
data
(),
plugin_inputs
.
size
(),
*
plugin_obj
);
plugin_layer
->
setName
((
"ManyEmbLayerNormPluginDynamic_V
2
(Output: "
+
plugin_layer
->
setName
((
"ManyEmbLayerNormPluginDynamic_V
1
(Output: "
+
op_desc
.
Output
(
"Out"
)[
0
]
+
")"
)
.
c_str
());
free
(
plugin_ptr
);
...
...
@@ -248,7 +248,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
layer
=
plugin_layer
;
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
RreplenishLayerAndOutput
(
layer
,
"ManyEmbLayerNormPluginDynamic_V
2
"
,
"ManyEmbLayerNormPluginDynamic_V
1
"
,
{
output_name
,
std
::
string
(
"qkv_plugin_mask"
)},
test_mode
);
}
...
...
paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
浏览文件 @
6512e087
...
...
@@ -194,7 +194,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter {
"max_seqlen_tensor"
));
// max_seqlen, eval_placeholder_3
auto
creator
=
GetPluginRegistry
()
->
getPluginCreator
(
"ManyEmbLayerNormPluginDynamic"
,
"
3
"
);
"ManyEmbLayerNormPluginDynamic"
,
"
2
"
);
auto
plugin_obj
=
creator
->
createPlugin
(
"ManyEmbLayerNormPluginDynamic"
,
plugin_ptr
);
...
...
paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernelHFace.cu
浏览文件 @
6512e087
...
...
@@ -30,20 +30,22 @@ namespace tensorrt {
namespace
plugin
{
template
<
typename
T
,
unsigned
TPB
>
__global__
void
embLayerNormKernelHFace
(
int32_t
ld
,
int32_t
**
inputIds
,
int32_t
const
nbLookupTables
,
__global__
void
embLayerNormKernelHFace_2
(
int32_t
ld
,
int32_t
const
*
inputIds0
,
int32_t
const
*
inputIds1
,
int32_t
nbLookupTables
,
float
const
*
beta
,
float
const
*
gamma
,
T
**
mIdsEmbDev
,
int32_t
*
IdsSize
,
T
const
*
mIdsEmbDev0
,
T
const
*
mIdsEmbDev1
,
int32_t
IdsSize0
,
int32_t
IdsSize1
,
T
*
output
)
{
cub
::
Sum
pairSum
;
int32_t
const
s
=
blockIdx
.
x
;
int32_t
const
b
=
blockIdx
.
y
;
int32_t
*
cuSeqlens
=
inputIds
[
0
];
int32_t
const
sumS
=
cuSeqlens
[
b
];
int32_t
const
s_b
=
cuSeqlens
[
b
+
1
]
-
sumS
;
int32_t
const
sumS
=
inputIds0
[
b
];
int32_t
const
s_b
=
inputIds0
[
b
+
1
]
-
sumS
;
if
(
s
>=
s_b
)
{
return
;
// This CTA has nothing to do
}
...
...
@@ -52,17 +54,87 @@ __global__ void embLayerNormKernelHFace(int32_t ld,
extern
__shared__
int32_t
word_id
[];
if
(
threadIdx
.
x
==
0
)
{
for
(
int
i
=
1
;
i
<
nbLookupTables
;
++
i
)
{
if
(
static_cast
<
int32_t
const
*>
(
inputIds
[
i
])[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds
[
i
])[
seqPos
]
>=
IdsSize
[
i
])
{
if
(
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
]
>=
IdsSize1
)
{
printf
(
"Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup "
"table: ID < 0 or ID > max "
);
return
;
}
else
{
word_id
[
i
-
1
]
=
static_cast
<
int32_t
const
*>
(
inputIds
[
i
]
)[
seqPos
];
word_id
[
0
]
=
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
];
}
}
__syncthreads
();
// 2. load pos/tok/word embeddings and add them toghether
// offset into embeddings is given by wordId * hidden_size
int32_t
const
poffset
=
blockIdx
.
x
*
ld
;
int32_t
const
outOffset
=
seqPos
*
ld
;
// the output offset is given by b * (S*hidden_size) + s * hidden_size
kvp
<
T
>
threadData
(
0
,
0
);
for
(
int32_t
it
=
threadIdx
.
x
;
it
<
ld
;
it
+=
TPB
)
{
T
p
(
mIdsEmbDev0
[
poffset
+
it
]);
// pos id
T
val
=
p
;
int32_t
const
offset
=
word_id
[
0
]
*
ld
;
val
+=
mIdsEmbDev1
[
offset
+
it
];
output
[
outOffset
+
it
]
=
val
;
T
const
rldval
=
rld
*
val
;
threadData
=
pairSum
(
threadData
,
kvp
<
T
>
(
rldval
,
rldval
*
val
));
}
// 3. layer norm on the sum
layerNorm
<
T
,
T
,
float
,
TPB
>
(
threadData
,
ld
,
outOffset
,
beta
,
gamma
,
output
);
}
template
<
typename
T
,
unsigned
TPB
>
__global__
void
embLayerNormKernelHFace_3
(
int32_t
ld
,
int32_t
const
*
inputIds0
,
int32_t
const
*
inputIds1
,
int32_t
const
*
inputIds2
,
int32_t
nbLookupTables
,
float
const
*
beta
,
float
const
*
gamma
,
T
const
*
mIdsEmbDev0
,
T
const
*
mIdsEmbDev1
,
T
const
*
mIdsEmbDev2
,
int32_t
IdsSize0
,
int32_t
IdsSize1
,
int32_t
IdsSize2
,
T
*
output
)
{
cub
::
Sum
pairSum
;
int32_t
const
s
=
blockIdx
.
x
;
int32_t
const
b
=
blockIdx
.
y
;
int32_t
const
sumS
=
inputIds0
[
b
];
int32_t
const
s_b
=
inputIds0
[
b
+
1
]
-
sumS
;
if
(
s
>=
s_b
)
{
return
;
// This CTA has nothing to do
}
T
const
rld
=
T
(
1.
f
)
/
T
(
ld
);
int32_t
const
seqPos
=
sumS
+
s
;
extern
__shared__
int32_t
word_id
[];
if
(
threadIdx
.
x
==
0
)
{
if
(
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
]
>=
IdsSize1
)
{
printf
(
"Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup "
"table: ID < 0 or ID > max "
);
return
;
}
else
{
word_id
[
0
]
=
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
];
}
if
(
static_cast
<
int32_t
const
*>
(
inputIds2
)[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds2
)[
seqPos
]
>=
IdsSize2
)
{
printf
(
"Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup "
"table: ID < 0 or ID > max "
);
return
;
}
else
{
word_id
[
1
]
=
static_cast
<
int32_t
const
*>
(
inputIds2
)[
seqPos
];
}
}
__syncthreads
();
...
...
@@ -74,12 +146,101 @@ __global__ void embLayerNormKernelHFace(int32_t ld,
kvp
<
T
>
threadData
(
0
,
0
);
for
(
int32_t
it
=
threadIdx
.
x
;
it
<
ld
;
it
+=
TPB
)
{
T
p
(
mIdsEmbDev
[
0
]
[
poffset
+
it
]);
// pos id
T
p
(
mIdsEmbDev
0
[
poffset
+
it
]);
// pos id
T
val
=
p
;
for
(
int
i
=
1
;
i
<
nbLookupTables
;
++
i
)
{
int32_t
const
offset
=
word_id
[
i
-
1
]
*
ld
;
val
+=
mIdsEmbDev
[
i
][
offset
+
it
];
int32_t
const
offset0
=
word_id
[
0
]
*
ld
;
val
+=
mIdsEmbDev1
[
offset0
+
it
];
int32_t
const
offset1
=
word_id
[
1
]
*
ld
;
val
+=
mIdsEmbDev2
[
offset1
+
it
];
output
[
outOffset
+
it
]
=
val
;
T
const
rldval
=
rld
*
val
;
threadData
=
pairSum
(
threadData
,
kvp
<
T
>
(
rldval
,
rldval
*
val
));
}
// 3. layer norm on the sum
layerNorm
<
T
,
T
,
float
,
TPB
>
(
threadData
,
ld
,
outOffset
,
beta
,
gamma
,
output
);
}
template
<
typename
T
,
unsigned
TPB
>
__global__
void
embLayerNormKernelHFace_4
(
int32_t
ld
,
int32_t
const
*
inputIds0
,
int32_t
const
*
inputIds1
,
int32_t
const
*
inputIds2
,
int32_t
const
*
inputIds3
,
int32_t
nbLookupTables
,
float
const
*
beta
,
float
const
*
gamma
,
T
const
*
mIdsEmbDev0
,
T
const
*
mIdsEmbDev1
,
T
const
*
mIdsEmbDev2
,
T
const
*
mIdsEmbDev3
,
int32_t
IdsSize0
,
int32_t
IdsSize1
,
int32_t
IdsSize2
,
int32_t
IdsSize3
,
T
*
output
)
{
cub
::
Sum
pairSum
;
int32_t
const
s
=
blockIdx
.
x
;
int32_t
const
b
=
blockIdx
.
y
;
int32_t
const
sumS
=
inputIds0
[
b
];
int32_t
const
s_b
=
inputIds0
[
b
+
1
]
-
sumS
;
if
(
s
>=
s_b
)
{
return
;
// This CTA has nothing to do
}
T
const
rld
=
T
(
1.
f
)
/
T
(
ld
);
int32_t
const
seqPos
=
sumS
+
s
;
extern
__shared__
int32_t
word_id
[];
if
(
threadIdx
.
x
==
0
)
{
if
(
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
]
>=
IdsSize1
)
{
printf
(
"Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup "
"table: ID < 0 or ID > max "
);
return
;
}
else
{
word_id
[
0
]
=
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
];
}
if
(
static_cast
<
int32_t
const
*>
(
inputIds2
)[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds2
)[
seqPos
]
>=
IdsSize2
)
{
printf
(
"Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup "
"table: ID < 0 or ID > max "
);
return
;
}
else
{
word_id
[
1
]
=
static_cast
<
int32_t
const
*>
(
inputIds2
)[
seqPos
];
}
if
(
static_cast
<
int32_t
const
*>
(
inputIds3
)[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds3
)[
seqPos
]
>=
IdsSize3
)
{
printf
(
"Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup "
"table: ID < 0 or ID > max "
);
return
;
}
else
{
word_id
[
2
]
=
static_cast
<
int32_t
const
*>
(
inputIds3
)[
seqPos
];
}
}
__syncthreads
();
// 2. load pos/tok/word embeddings and add them toghether
// offset into embeddings is given by wordId * hidden_size
int32_t
const
poffset
=
blockIdx
.
x
*
ld
;
int32_t
const
outOffset
=
seqPos
*
ld
;
// the output offset is given by b * (S*hidden_size) + s * hidden_size
kvp
<
T
>
threadData
(
0
,
0
);
for
(
int32_t
it
=
threadIdx
.
x
;
it
<
ld
;
it
+=
TPB
)
{
T
p
(
mIdsEmbDev0
[
poffset
+
it
]);
// pos id
T
val
=
p
;
int32_t
const
offset0
=
word_id
[
0
]
*
ld
;
val
+=
mIdsEmbDev1
[
offset0
+
it
];
int32_t
const
offset1
=
word_id
[
1
]
*
ld
;
val
+=
mIdsEmbDev2
[
offset1
+
it
];
int32_t
const
offset2
=
word_id
[
2
]
*
ld
;
val
+=
mIdsEmbDev3
[
offset2
+
it
];
output
[
outOffset
+
it
]
=
val
;
T
const
rldval
=
rld
*
val
;
...
...
@@ -89,52 +250,233 @@ __global__ void embLayerNormKernelHFace(int32_t ld,
// 3. layer norm on the sum
layerNorm
<
T
,
T
,
float
,
TPB
>
(
threadData
,
ld
,
outOffset
,
beta
,
gamma
,
output
);
}
// Host-side launcher for embLayerNormKernelHFace_2 (two lookup tables).
//
// Launches one thread block per (s, b) pair on an S x B grid with 256 threads
// per block on `stream`. The dynamic shared-memory size is
// sizeof(int32_t) * (nbLookupTables - 1), which backs the kernel's
// `extern __shared__ word_id[]` buffer.
//
// All pointer arguments are forwarded to the kernel unchanged and are
// therefore dereferenced on the device.
// NOTE(review): presumably inputIds0 holds cumulative sequence offsets and
// inputIds1 the token ids - confirm against the plugin's enqueue().
//
// Returns the value of cudaPeekAtLastError() converted to int32_t, i.e. the
// launch status; the kernel itself runs asynchronously on `stream`.
template <typename T>
int32_t embSkipLayerNormHFace_2(cudaStream_t stream,
                                int32_t ld,
                                int32_t B,
                                int32_t S,
                                int const* inputIds0,
                                int const* inputIds1,
                                int32_t nbLookupTables,
                                float const* beta,
                                float const* gamma,
                                T const* mIdsEmbDev0,
                                T const* mIdsEmbDev1,
                                int32_t IdsSize0,
                                int32_t IdsSize1,
                                T* output) {
  constexpr int32_t kThreadsPerBlock = 256;

  // One int32 slot per lookup table except the first one.
  size_t const wordIdBytes = (nbLookupTables - 1) * sizeof(int32_t);

  dim3 const launchBlock(kThreadsPerBlock, 1, 1);
  dim3 const launchGrid(S, B, 1);

  embLayerNormKernelHFace_2<T, kThreadsPerBlock>
      <<<launchGrid, launchBlock, wordIdBytes, stream>>>(ld,
                                                         inputIds0,
                                                         inputIds1,
                                                         nbLookupTables,
                                                         beta,
                                                         gamma,
                                                         mIdsEmbDev0,
                                                         mIdsEmbDev1,
                                                         IdsSize0,
                                                         IdsSize1,
                                                         output);

  return static_cast<int32_t>(cudaPeekAtLastError());
}
// Host-side launcher for embLayerNormKernelHFace_3 (three lookup tables).
//
// Launches one thread block per (s, b) pair on an S x B grid with 256 threads
// per block on `stream`. The dynamic shared-memory size is
// sizeof(int32_t) * (nbLookupTables - 1), which backs the kernel's
// `extern __shared__ word_id[]` buffer (the kernel writes word_id[0] and
// word_id[1]).
//
// All pointer arguments are forwarded to the kernel unchanged and are
// therefore dereferenced on the device.
// NOTE(review): presumably inputIds0 holds cumulative sequence offsets and
// inputIds1/inputIds2 the per-token ids - confirm against the plugin's
// enqueue().
//
// Returns the value of cudaPeekAtLastError() converted to int32_t, i.e. the
// launch status; the kernel itself runs asynchronously on `stream`.
template <typename T>
int32_t embSkipLayerNormHFace_3(cudaStream_t stream,
                                int32_t ld,
                                int32_t B,
                                int32_t S,
                                int const* inputIds0,
                                int const* inputIds1,
                                int const* inputIds2,
                                int32_t nbLookupTables,
                                float const* beta,
                                float const* gamma,
                                T const* mIdsEmbDev0,
                                T const* mIdsEmbDev1,
                                T const* mIdsEmbDev2,
                                int32_t IdsSize0,
                                int32_t IdsSize1,
                                int32_t IdsSize2,
                                T* output) {
  constexpr int32_t kThreadsPerBlock = 256;

  // One int32 slot per lookup table except the first one.
  size_t const wordIdBytes = (nbLookupTables - 1) * sizeof(int32_t);

  dim3 const launchBlock(kThreadsPerBlock, 1, 1);
  dim3 const launchGrid(S, B, 1);

  embLayerNormKernelHFace_3<T, kThreadsPerBlock>
      <<<launchGrid, launchBlock, wordIdBytes, stream>>>(ld,
                                                         inputIds0,
                                                         inputIds1,
                                                         inputIds2,
                                                         nbLookupTables,
                                                         beta,
                                                         gamma,
                                                         mIdsEmbDev0,
                                                         mIdsEmbDev1,
                                                         mIdsEmbDev2,
                                                         IdsSize0,
                                                         IdsSize1,
                                                         IdsSize2,
                                                         output);

  return static_cast<int32_t>(cudaPeekAtLastError());
}
template
<
typename
T
>
int32_t
embSkipLayerNormHFace
(
cudaStream_t
stream
,
int32_t
embSkipLayerNormHFace
_4
(
cudaStream_t
stream
,
int32_t
ld
,
int32_t
B
,
int32_t
S
,
int32_t
**
inputIds
,
int32_t
const
nbLookupTables
,
int
const
*
inputIds0
,
int
const
*
inputIds1
,
int
const
*
inputIds2
,
int
const
*
inputIds3
,
int32_t
nbLookupTables
,
float
const
*
beta
,
float
const
*
gamma
,
T
**
mIdsEmbDev
,
int32_t
*
IdsSize
,
T
const
*
mIdsEmbDev0
,
T
const
*
mIdsEmbDev1
,
T
const
*
mIdsEmbDev2
,
T
const
*
mIdsEmbDev3
,
int32_t
IdsSize0
,
int32_t
IdsSize1
,
int32_t
IdsSize2
,
int32_t
IdsSize3
,
T
*
output
)
{
constexpr
int32_t
tpb
=
256
;
dim3
const
grid
(
S
,
B
,
1
);
dim3
const
block
(
tpb
,
1
,
1
);
size_t
cache_size
=
sizeof
(
int32_t
)
*
(
nbLookupTables
-
1
);
embLayerNormKernelHFace
<
T
,
tpb
><<<
grid
,
block
,
cache_size
,
stream
>>>
(
ld
,
inputIds
,
nbLookupTables
,
beta
,
gamma
,
mIdsEmbDev
,
IdsSize
,
output
);
embLayerNormKernelHFace_4
<
T
,
tpb
>
<<<
grid
,
block
,
cache_size
,
stream
>>>
(
ld
,
inputIds0
,
inputIds1
,
inputIds2
,
inputIds3
,
nbLookupTables
,
beta
,
gamma
,
mIdsEmbDev0
,
mIdsEmbDev1
,
mIdsEmbDev2
,
mIdsEmbDev3
,
IdsSize0
,
IdsSize1
,
IdsSize2
,
IdsSize3
,
output
);
return
cudaPeekAtLastError
();
}
template
int32_t
embSkipLayerNormHFace
<
float
>(
cudaStream_t
,
template
int32_t
embSkipLayerNormHFace_2
<
float
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
float
const
*
,
float
const
*
,
int32_t
,
int32_t
,
float
*
);
template
int32_t
embSkipLayerNormHFace_3
<
float
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
**
,
int32_t
const
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
float
const
*
,
float
const
*
,
float
const
*
,
int32_t
,
int32_t
,
int32_t
,
float
*
);
template
int32_t
embSkipLayerNormHFace_4
<
float
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
float
const
*
,
float
const
*
,
float
const
*
,
float
**
,
int32_t
*
,
float
const
*
,
int32_t
,
int32_t
,
int32_t
,
int32_t
,
float
*
);
template
int32_t
embSkipLayerNormHFace
<
half
>(
cudaStream_t
,
template
int32_t
embSkipLayerNormHFace_2
<
half
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
half
const
*
,
half
const
*
,
int32_t
,
int32_t
,
half
*
);
template
int32_t
embSkipLayerNormHFace_3
<
half
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
**
,
int32_t
const
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
half
**
,
int32_t
*
,
half
const
*
,
half
const
*
,
half
const
*
,
int32_t
,
int32_t
,
int32_t
,
half
*
);
template
int32_t
embSkipLayerNormHFace_4
<
half
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
half
const
*
,
half
const
*
,
half
const
*
,
half
const
*
,
int32_t
,
int32_t
,
int32_t
,
int32_t
,
half
*
);
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
...
...
paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernelMTron.cu
浏览文件 @
6512e087
...
...
@@ -30,121 +30,469 @@ namespace tensorrt {
namespace
plugin
{
template
<
typename
T
,
unsigned
TPB
>
__global__
void
embLayerNormKernelMTron
(
int32_t
ld
,
int32_t
**
inputIds
,
int32_t
const
nbLookupTables
,
__global__
void
embLayerNormKernelMTron_2
(
int32_t
ld
,
int32_t
const
*
inputIds0
,
int32_t
const
*
inputIds1
,
int32_t
nbLookupTables
,
float
const
*
beta
,
float
const
*
gamma
,
T
**
mIdsEmbDev
,
int32_t
*
IdsSize
,
T
const
*
mIdsEmbDev0
,
T
const
*
mIdsEmbDev1
,
int32_t
IdsSize0
,
int32_t
IdsSize1
,
T
*
output
,
T
*
skip
)
{
cub
::
Sum
pairSum
;
int32_t
const
s
=
blockIdx
.
x
;
int32_t
const
b
=
blockIdx
.
y
;
int32_t
*
cuSeqlens
=
inputIds
[
0
];
int32_t
const
sumS
=
cuSeqlens
[
b
];
int32_t
const
s_b
=
cuSeqlens
[
b
+
1
]
-
sumS
;
int32_t
const
sumS
=
inputIds0
[
b
];
int32_t
const
s_b
=
inputIds0
[
b
+
1
]
-
sumS
;
if
(
s
>=
s_b
)
{
return
;
// This CTA has nothing to do
}
T
const
rld
=
T
(
1.
f
)
/
T
(
ld
);
int32_t
cons
t
seqPos
=
sumS
+
s
;
const
int32_
t
seqPos
=
sumS
+
s
;
extern
__shared__
int32_t
word_id
[];
if
(
threadIdx
.
x
==
0
)
{
for
(
int
i
=
1
;
i
<
nbLookupTables
;
++
i
)
{
if
(
static_cast
<
int32_t
const
*>
(
inputIds
[
i
])[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds
[
i
])[
seqPos
]
>=
IdsSize
[
i
])
{
if
(
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
]
>=
IdsSize1
)
{
printf
(
"Error !!!!!!!!!!!!!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot
"
"be lookup
table: ID < 0 or ID > max "
);
"Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup
"
"
table: ID < 0 or ID > max "
);
return
;
}
else
{
word_id
[
i
-
1
]
=
static_cast
<
int32_t
const
*>
(
inputIds
[
i
]
)[
seqPos
];
word_id
[
0
]
=
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
];
}
}
__syncthreads
();
// 2. load pos/tok/word embeddings and add them toghether
// offset into embeddings is given by wordId * hidden_size
const
int32_t
poffset
=
blockIdx
.
x
*
ld
;
const
int32_t
outOffset
=
seqPos
*
ld
;
// the output offset is given by b * (S*hidden_size) + s * hidden_size
kvp
<
T
>
threadData
(
0
,
0
);
for
(
int32_t
it
=
threadIdx
.
x
;
it
<
ld
;
it
+=
TPB
)
{
T
p
(
mIdsEmbDev0
[
poffset
+
it
]);
// pos id
T
val
=
p
;
const
int32_t
offset
=
word_id
[
0
]
*
ld
;
val
+=
mIdsEmbDev1
[
offset
+
it
];
output
[
outOffset
+
it
]
=
val
;
skip
[
outOffset
+
it
]
=
val
;
const
T
rldval
=
rld
*
val
;
threadData
=
pairSum
(
threadData
,
kvp
<
T
>
(
rldval
,
rldval
*
val
));
}
// 3. layer norm on the sum
layerNorm
<
T
,
T
,
float
,
TPB
>
(
threadData
,
ld
,
outOffset
,
beta
,
gamma
,
output
);
}
template
<
typename
T
,
unsigned
TPB
>
__global__
void
embLayerNormKernelMTron_3
(
int32_t
ld
,
int32_t
const
*
inputIds0
,
int32_t
const
*
inputIds1
,
int32_t
const
*
inputIds2
,
int32_t
nbLookupTables
,
float
const
*
beta
,
float
const
*
gamma
,
T
const
*
mIdsEmbDev0
,
T
const
*
mIdsEmbDev1
,
T
const
*
mIdsEmbDev2
,
int32_t
IdsSize0
,
int32_t
IdsSize1
,
int32_t
IdsSize2
,
T
*
output
,
T
*
skip
)
{
cub
::
Sum
pairSum
;
const
int32_t
s
=
blockIdx
.
x
;
const
int32_t
b
=
blockIdx
.
y
;
const
int32_t
sumS
=
inputIds0
[
b
];
const
int32_t
s_b
=
inputIds0
[
b
+
1
]
-
sumS
;
if
(
s
>=
s_b
)
{
return
;
// This CTA has nothing to do
}
const
T
rld
=
T
(
1.
f
)
/
T
(
ld
);
const
int32_t
seqPos
=
sumS
+
s
;
extern
__shared__
int32_t
word_id
[];
if
(
threadIdx
.
x
==
0
)
{
if
(
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
]
>=
IdsSize1
)
{
printf
(
"Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup "
"table: ID < 0 or ID > max "
);
return
;
}
else
{
word_id
[
0
]
=
static_cast
<
int32_t
const
*>
(
inputIds1
)[
seqPos
];
}
if
(
static_cast
<
int32_t
const
*>
(
inputIds2
)[
seqPos
]
<
0
||
static_cast
<
int32_t
const
*>
(
inputIds2
)[
seqPos
]
>=
IdsSize2
)
{
printf
(
"Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup "
"table: ID < 0 or ID > max "
);
return
;
}
else
{
word_id
[
1
]
=
static_cast
<
int32_t
const
*>
(
inputIds2
)[
seqPos
];
}
}
__syncthreads
();
// 2. load pos/tok/word embeddings and add them toghether
// offset into embeddings is given by wordId * hidden_size
int32_t
cons
t
poffset
=
blockIdx
.
x
*
ld
;
int32_t
cons
t
outOffset
=
seqPos
*
ld
;
const
int32_
t
poffset
=
blockIdx
.
x
*
ld
;
const
int32_
t
outOffset
=
seqPos
*
ld
;
// the output offset is given by b * (S*hidden_size) + s * hidden_size
kvp
<
T
>
threadData
(
0
,
0
);
for
(
int32_t
it
=
threadIdx
.
x
;
it
<
ld
;
it
+=
TPB
)
{
T
p
(
mIdsEmbDev
[
0
]
[
poffset
+
it
]);
// pos id
T
p
(
mIdsEmbDev
0
[
poffset
+
it
]);
// pos id
T
val
=
p
;
for
(
int
i
=
1
;
i
<
nbLookupTables
;
++
i
)
{
int32_t
const
offset
=
word_id
[
i
-
1
]
*
ld
;
val
+=
mIdsEmbDev
[
i
][
offset
+
it
];
const
int32_t
offset0
=
word_id
[
0
]
*
ld
;
val
+=
mIdsEmbDev1
[
offset0
+
it
];
const
int32_t
offset1
=
word_id
[
1
]
*
ld
;
val
+=
mIdsEmbDev2
[
offset1
+
it
];
output
[
outOffset
+
it
]
=
val
;
skip
[
outOffset
+
it
]
=
val
;
const
T
rldval
=
rld
*
val
;
threadData
=
pairSum
(
threadData
,
kvp
<
T
>
(
rldval
,
rldval
*
val
));
}
// 3. layer norm on the sum
layerNorm
<
T
,
T
,
float
,
TPB
>
(
threadData
,
ld
,
outOffset
,
beta
,
gamma
,
output
);
}
// Fused "sum of four embeddings + layer norm" kernel (variable sequence
// length, four lookup tables).
//
// Work decomposition: block (blockIdx.x = s, blockIdx.y = b) handles a single
// row. inputIds0 is read as a table of start offsets: the row index is
// seqPos = inputIds0[b] + s, and blocks with s >= inputIds0[b + 1] -
// inputIds0[b] exit immediately.
//
// Per row:
//   1. Thread 0 reads inputIds1/2/3[seqPos], checks each against
//      [0, IdsSizeN) and publishes them through the shared word_id[] buffer.
//   2. Every thread strides over the `ld` columns, summing
//      mIdsEmbDev0[s * ld + col] with the three looked-up rows
//      mIdsEmbDevN[word_id[N-1] * ld + col]. The un-normalized sum is written
//      to both `output` and `skip` at row seqPos.
//   3. layerNorm<>() (defined elsewhere) is called on `output` with
//      beta/gamma and the per-thread partial sums.
//
// Launch requirements visible from the code: at least 3 * sizeof(int32_t)
// bytes of dynamic shared memory (word_id[0..2] are written) and TPB equal to
// the block size used for the column stride.
//
// `nbLookupTables` and `IdsSize0` are accepted for signature parity with the
// launcher but are not read by this kernel.
//
// Error handling: if any id is out of range, thread 0 prints a diagnostic and
// the WHOLE block returns without writing its row. (Previously only thread 0
// returned, leaving the other threads to index the embedding tables with
// unset word_id[] entries.)
template <typename T, unsigned TPB>
__global__ void embLayerNormKernelMTron_4(int32_t ld,
                                          int32_t const* inputIds0,
                                          int32_t const* inputIds1,
                                          int32_t const* inputIds2,
                                          int32_t const* inputIds3,
                                          int32_t nbLookupTables,
                                          float const* beta,
                                          float const* gamma,
                                          T const* mIdsEmbDev0,
                                          T const* mIdsEmbDev1,
                                          T const* mIdsEmbDev2,
                                          T const* mIdsEmbDev3,
                                          int32_t IdsSize0,
                                          int32_t IdsSize1,
                                          int32_t IdsSize2,
                                          int32_t IdsSize3,
                                          T* output,
                                          T* skip) {
  cub::Sum pairSum;
  const int32_t s = blockIdx.x;
  const int32_t b = blockIdx.y;
  const int32_t sumS = inputIds0[b];
  const int32_t s_b = inputIds0[b + 1] - sumS;
  if (s >= s_b) {
    return;  // This CTA has nothing to do
  }
  const T rld = T(1.f) / T(ld);
  const int32_t seqPos = sumS + s;
  extern __shared__ int32_t word_id[];

  // 1. Thread 0 validates the three ids for this row and stages them in
  //    shared memory. `invalidId` stays 0 on every other thread.
  int32_t invalidId = 0;
  if (threadIdx.x == 0) {
    int32_t const* const idTables[3] = {inputIds1, inputIds2, inputIds3};
    int32_t const tableSizes[3] = {IdsSize1, IdsSize2, IdsSize3};
    for (int32_t t = 0; t < 3; ++t) {
      int32_t const id = idTables[t][seqPos];
      if (id < 0 || id >= tableSizes[t]) {
        printf(
            "Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup "
            "table: ID < 0 or ID > max ");
        invalidId = 1;
        break;  // stop at the first bad id, as the original checks did
      }
      word_id[t] = id;
    }
  }
  // Barrier + block-wide OR: makes word_id[] visible to all threads and lets
  // every thread learn whether thread 0 rejected an id, so the block exits as
  // a unit instead of dereferencing garbage offsets.
  if (__syncthreads_or(invalidId)) {
    return;
  }

  // 2. load pos/tok/word embeddings and add them together
  // offset into embeddings is given by wordId * hidden_size
  const int32_t poffset = blockIdx.x * ld;
  // rows are packed back to back: the output offset is seqPos * hidden_size
  const int32_t outOffset = seqPos * ld;
  // The looked-up row offsets do not depend on the column, so compute them
  // once outside the column loop.
  const int32_t offset0 = word_id[0] * ld;
  const int32_t offset1 = word_id[1] * ld;
  const int32_t offset2 = word_id[2] * ld;

  kvp<T> threadData(0, 0);

  for (int32_t it = threadIdx.x; it < ld; it += TPB) {
    T val(mIdsEmbDev0[poffset + it]);  // pos id
    val += mIdsEmbDev1[offset0 + it];
    val += mIdsEmbDev2[offset1 + it];
    val += mIdsEmbDev3[offset2 + it];

    output[outOffset + it] = val;
    skip[outOffset + it] = val;

    // Accumulate (val / ld, val^2 / ld) for the layer-norm statistics.
    const T rldval = rld * val;
    threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
  }

  // 3. layer norm on the sum
  layerNorm<T, T, float, TPB>(threadData, ld, outOffset, beta, gamma, output);
}
// Host-side launcher for embLayerNormKernelMTron_2 (two lookup tables).
//
// Launches one thread block per (s, b) pair on an S x B grid with 256 threads
// per block on `stream`. The dynamic shared-memory size is
// sizeof(int32_t) * (nbLookupTables - 1), which backs the kernel's
// `extern __shared__ word_id[]` buffer.
//
// Unlike the HFace variant, this launcher forwards two device output
// pointers: `output` and `skip`.
// NOTE(review): presumably `skip` receives the un-normalized embedding sum
// and `output` the layer-normalized result - confirm against the kernel.
//
// Returns the value of cudaPeekAtLastError() converted to int32_t, i.e. the
// launch status; the kernel itself runs asynchronously on `stream`.
template <typename T>
int32_t embSkipLayerNormMTron_2(cudaStream_t stream,
                                int32_t ld,
                                int32_t B,
                                int32_t S,
                                int32_t const* inputIds0,
                                int32_t const* inputIds1,
                                int32_t nbLookupTables,
                                float const* beta,
                                float const* gamma,
                                T const* mIdsEmbDev0,
                                T const* mIdsEmbDev1,
                                int32_t IdsSize0,
                                int32_t IdsSize1,
                                T* output,
                                T* skip) {
  constexpr int32_t kThreadsPerBlock = 256;

  // One int32 slot per lookup table except the first one.
  size_t const wordIdBytes = (nbLookupTables - 1) * sizeof(int32_t);

  dim3 const launchBlock(kThreadsPerBlock, 1, 1);
  dim3 const launchGrid(S, B, 1);

  embLayerNormKernelMTron_2<T, kThreadsPerBlock>
      <<<launchGrid, launchBlock, wordIdBytes, stream>>>(ld,
                                                         inputIds0,
                                                         inputIds1,
                                                         nbLookupTables,
                                                         beta,
                                                         gamma,
                                                         mIdsEmbDev0,
                                                         mIdsEmbDev1,
                                                         IdsSize0,
                                                         IdsSize1,
                                                         output,
                                                         skip);

  return static_cast<int32_t>(cudaPeekAtLastError());
}
// Host-side launcher for embLayerNormKernelMTron_3 (three lookup tables).
//
// Launches one thread block per (s, b) pair on an S x B grid with 256 threads
// per block on `stream`. The dynamic shared-memory size is
// sizeof(int32_t) * (nbLookupTables - 1), which backs the kernel's
// `extern __shared__ word_id[]` buffer (the kernel writes word_id[0] and
// word_id[1]).
//
// Unlike the HFace variant, this launcher forwards two device output
// pointers: `output` and `skip`.
// NOTE(review): presumably `skip` receives the un-normalized embedding sum
// and `output` the layer-normalized result - confirm against the kernel.
//
// Returns the value of cudaPeekAtLastError() converted to int32_t, i.e. the
// launch status; the kernel itself runs asynchronously on `stream`.
template <typename T>
int32_t embSkipLayerNormMTron_3(cudaStream_t stream,
                                int32_t ld,
                                int32_t B,
                                int32_t S,
                                int32_t const* inputIds0,
                                int32_t const* inputIds1,
                                int32_t const* inputIds2,
                                int32_t nbLookupTables,
                                float const* beta,
                                float const* gamma,
                                T const* mIdsEmbDev0,
                                T const* mIdsEmbDev1,
                                T const* mIdsEmbDev2,
                                int32_t IdsSize0,
                                int32_t IdsSize1,
                                int32_t IdsSize2,
                                T* output,
                                T* skip) {
  constexpr int32_t kThreadsPerBlock = 256;

  // One int32 slot per lookup table except the first one.
  size_t const wordIdBytes = (nbLookupTables - 1) * sizeof(int32_t);

  dim3 const launchBlock(kThreadsPerBlock, 1, 1);
  dim3 const launchGrid(S, B, 1);

  embLayerNormKernelMTron_3<T, kThreadsPerBlock>
      <<<launchGrid, launchBlock, wordIdBytes, stream>>>(ld,
                                                         inputIds0,
                                                         inputIds1,
                                                         inputIds2,
                                                         nbLookupTables,
                                                         beta,
                                                         gamma,
                                                         mIdsEmbDev0,
                                                         mIdsEmbDev1,
                                                         mIdsEmbDev2,
                                                         IdsSize0,
                                                         IdsSize1,
                                                         IdsSize2,
                                                         output,
                                                         skip);

  return static_cast<int32_t>(cudaPeekAtLastError());
}
template
<
typename
T
>
int32_t
embSkipLayerNormMTron
(
cudaStream_t
stream
,
int32_t
embSkipLayerNormMTron
_4
(
cudaStream_t
stream
,
int32_t
ld
,
int32_t
B
,
int32_t
S
,
int32_t
**
inputIds
,
int32_t
const
nbLookupTables
,
int32_t
const
*
inputIds0
,
int32_t
const
*
inputIds1
,
int32_t
const
*
inputIds2
,
int32_t
const
*
inputIds3
,
int32_t
nbLookupTables
,
float
const
*
beta
,
float
const
*
gamma
,
T
**
mIdsEmbDev
,
int32_t
*
IdsSize
,
T
const
*
mIdsEmbDev0
,
T
const
*
mIdsEmbDev1
,
T
const
*
mIdsEmbDev2
,
T
const
*
mIdsEmbDev3
,
int32_t
IdsSize0
,
int32_t
IdsSize1
,
int32_t
IdsSize2
,
int32_t
IdsSize3
,
T
*
output
,
T
*
skip
)
{
constexpr
int32_t
tpb
=
256
;
dim3
const
grid
(
S
,
B
,
1
);
dim3
const
block
(
tpb
,
1
,
1
);
size_t
cache_size
=
sizeof
(
int32_t
)
*
(
nbLookupTables
-
1
);
embLayerNormKernelMTron
<
T
,
tpb
>
embLayerNormKernelMTron
_4
<
T
,
tpb
>
<<<
grid
,
block
,
cache_size
,
stream
>>>
(
ld
,
inputIds
,
inputIds0
,
inputIds1
,
inputIds2
,
inputIds3
,
nbLookupTables
,
beta
,
gamma
,
mIdsEmbDev
,
IdsSize
,
mIdsEmbDev0
,
mIdsEmbDev1
,
mIdsEmbDev2
,
mIdsEmbDev3
,
IdsSize0
,
IdsSize1
,
IdsSize2
,
IdsSize3
,
output
,
skip
);
return
cudaPeekAtLastError
();
}
template
int32_t
embSkipLayerNormMTron
<
float
>(
cudaStream_t
,
template
int32_t
embSkipLayerNormMTron
_2
<
float
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
**
,
int32_t
const
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
float
const
*
,
float
const
*
,
float
**
,
int32_t
*
,
int32_t
,
int32_t
,
float
*
,
float
*
);
template
int32_t
embSkipLayerNormMTron
<
half
>(
cudaStream_t
,
template
int32_t
embSkipLayerNormMTron_3
<
float
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
int32_t
**
,
int32_t
const
,
float
const
*
,
float
const
*
,
half
**
,
int32_t
*
,
float
const
*
,
float
const
*
,
float
const
*
,
int32_t
,
int32_t
,
int32_t
,
float
*
,
float
*
);
template
int32_t
embSkipLayerNormMTron_4
<
float
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
float
const
*
,
float
const
*
,
float
const
*
,
float
const
*
,
int32_t
,
int32_t
,
int32_t
,
int32_t
,
float
*
,
float
*
);
template
int32_t
embSkipLayerNormMTron_2
<
half
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
half
const
*
,
half
const
*
,
int32_t
,
int32_t
,
half
*
,
half
*
);
template
int32_t
embSkipLayerNormMTron_3
<
half
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
half
const
*
,
half
const
*
,
half
const
*
,
int32_t
,
int32_t
,
int32_t
,
half
*
,
half
*
);
template
int32_t
embSkipLayerNormMTron_4
<
half
>(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
half
const
*
,
half
const
*
,
half
const
*
,
half
const
*
,
int32_t
,
int32_t
,
int32_t
,
int32_t
,
half
*
,
half
*
);
...
...
paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.cu
浏览文件 @
6512e087
...
...
@@ -37,8 +37,8 @@ constexpr size_t xmmasM384 = 24;
constexpr
size_t
packedMaskSize128
=
xmmasM128
*
threadsPerCta128
;
constexpr
size_t
packedMaskSize256
=
xmmasM256
*
threadsPerCta256
;
constexpr
size_t
packedMaskSize384
=
xmmasM384
*
threadsPerCta384
;
char
const
*
EMB_LAYER_NORM_VAR_SEQLEN_VERSION_HFACE
{
"
2
"
};
char
const
*
EMB_LAYER_NORM_VAR_SEQLEN_VERSION_MTRON
{
"
3
"
};
char
const
*
EMB_LAYER_NORM_VAR_SEQLEN_VERSION_HFACE
{
"
1
"
};
char
const
*
EMB_LAYER_NORM_VAR_SEQLEN_VERSION_MTRON
{
"
2
"
};
char
const
*
EMB_LAYER_NORM_VAR_SEQLEN_NAME
{
"ManyEmbLayerNormPluginDynamic"
};
// Static class fields initialization
nvinfer1
::
PluginFieldCollection
EmbLayerNormVarSeqlenPluginBaseCreator
::
mFC
{};
...
...
@@ -74,7 +74,7 @@ EmbLayerNormVarSeqlenPluginBase::EmbLayerNormVarSeqlenPluginBase(
tem_weight
.
values
,
getWeightsSize
(
tem_weight
,
mType
),
cudaMemcpyHostToDevice
));
mIdsEmb
Dev
.
push_back
(
cudaMem
);
mIdsEmb
Ptrs
.
push_back
(
cudaMem
);
}
}
...
...
@@ -83,7 +83,7 @@ EmbLayerNormVarSeqlenPluginBase::EmbLayerNormVarSeqlenPluginBase(
:
mLayerName
(
name
),
mGammaDev
(
nullptr
),
mBetaDev
(
nullptr
),
mIdsEmb
Dev
{},
mIdsEmb
Ptrs
{},
mIdsEmb_
{}
{
// Deserialize in the same order as serialization
deserialize_value
(
&
data
,
&
length
,
&
mType
);
...
...
@@ -141,8 +141,8 @@ EmbLayerNormVarSeqlenPluginMTron::EmbLayerNormVarSeqlenPluginMTron(
// IPluginV2DynamicExt Methods
nvinfer1
::
IPluginV2DynamicExt
*
EmbLayerNormVarSeqlenPluginHFace
::
clone
()
const
noexcept
{
TRANSFORMER_DEBUG_MSG
(
"EmbLayerNormVarSeqlenPlugin
MTron
clone"
);
auto
p
=
new
EmbLayerNormVarSeqlenPlugin
MTron
(
TRANSFORMER_DEBUG_MSG
(
"EmbLayerNormVarSeqlenPlugin
HFace
clone"
);
auto
p
=
new
EmbLayerNormVarSeqlenPlugin
HFace
(
mLayerName
,
mType
,
mBeta
,
mGamma
,
mIdsEmb_
);
p
->
setPluginNamespace
(
mNamespace
.
c_str
());
return
p
;
...
...
@@ -333,7 +333,7 @@ int32_t EmbLayerNormVarSeqlenPluginHFace::enqueue(
void
*
const
*
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
noexcept
{
int32_t
const
batchSize
=
inputDesc
[
0
].
dims
.
d
[
0
]
-
1
;
int32_t
batchSize
=
inputDesc
[
0
].
dims
.
d
[
0
]
-
1
;
// read out the maximum sequence length from the dummy input
int32_t
const
maxSeqlen
=
inputDesc
[
nbLookupTables_
].
dims
.
d
[
0
];
int32_t
S
=
384
;
...
...
@@ -346,60 +346,132 @@ int32_t EmbLayerNormVarSeqlenPluginHFace::enqueue(
}
const
float
*
beta
=
mBetaDev
.
get
();
const
float
*
gamma
=
mGammaDev
.
get
();
int32_t
**
tem_inputs_ptr_dev
;
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
tem_inputs_ptr_dev
),
sizeof
(
void
*
)
*
nbLookupTables_
);
cudaMemcpy
(
tem_inputs_ptr_dev
,
inputs
,
sizeof
(
void
*
)
*
nbLookupTables_
,
cudaMemcpyHostToDevice
);
int32_t
*
mIdsVocabSize_dev
;
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
mIdsVocabSize_dev
),
sizeof
(
int32_t
)
*
mIdsVocabSize
.
size
());
cudaMemcpy
(
mIdsVocabSize_dev
,
&
(
mIdsVocabSize
[
0
]),
sizeof
(
int32_t
)
*
mIdsVocabSize
.
size
(),
cudaMemcpyHostToDevice
);
if
(
mType
==
nvinfer1
::
DataType
::
kFLOAT
)
{
auto
output
=
static_cast
<
float
*>
(
outputs
[
0
]);
float
**
mIdsEmbDev_float
;
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
mIdsEmbDev_float
),
sizeof
(
void
*
)
*
nbLookupTables_
);
cudaMemcpy
(
mIdsEmbDev_float
,
&
(
mIdsEmbDev
[
0
]),
sizeof
(
void
*
)
*
nbLookupTables_
,
cudaMemcpyHostToDevice
);
return
embSkipLayerNormHFace
<
float
>
(
stream
,
if
(
nbLookupTables_
==
2
)
{
return
embSkipLayerNormHFace_2
<
float
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
nbLookupTables_
,
beta
,
gamma
,
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
1
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
output
);
}
else
if
(
nbLookupTables_
==
3
)
{
return
embSkipLayerNormHFace_3
<
float
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
tem_inputs_ptr_dev
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
static_cast
<
int32_t
const
*>
(
inputs
[
2
]),
nbLookupTables_
,
beta
,
gamma
,
mIdsEmbDev_float
,
mIdsVocabSize_dev
,
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
1
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
2
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
mIdsVocabSize
[
2
],
output
);
}
else
if
(
nbLookupTables_
==
4
)
{
return
embSkipLayerNormHFace_4
<
float
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
static_cast
<
int32_t
const
*>
(
inputs
[
2
]),
static_cast
<
int32_t
const
*>
(
inputs
[
3
]),
nbLookupTables_
,
beta
,
gamma
,
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
1
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
2
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
3
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
mIdsVocabSize
[
2
],
mIdsVocabSize
[
3
],
output
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Only support 2,3,4 lookup_tables fused "
));
}
}
else
if
(
mType
==
nvinfer1
::
DataType
::
kHALF
)
{
auto
output
=
static_cast
<
half
*>
(
outputs
[
0
]);
half
**
mIdsEmbDev_half
;
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
mIdsEmbDev_half
),
sizeof
(
void
*
)
*
nbLookupTables_
);
cudaMemcpy
(
mIdsEmbDev_half
,
&
(
mIdsEmbDev
[
0
]),
sizeof
(
void
*
)
*
nbLookupTables_
,
cudaMemcpyHostToDevice
);
return
embSkipLayerNormHFace
<
half
>
(
stream
,
if
(
nbLookupTables_
==
2
)
{
return
embSkipLayerNormHFace_2
<
half
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
tem_inputs_ptr_dev
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
nbLookupTables_
,
beta
,
gamma
,
mIdsEmbDev_half
,
mIdsVocabSize_dev
,
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
1
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
output
);
}
else
if
(
nbLookupTables_
==
3
)
{
return
embSkipLayerNormHFace_3
<
half
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
static_cast
<
int32_t
const
*>
(
inputs
[
2
]),
nbLookupTables_
,
beta
,
gamma
,
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
1
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
2
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
mIdsVocabSize
[
2
],
output
);
}
else
if
(
nbLookupTables_
==
4
)
{
return
embSkipLayerNormHFace_4
<
half
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
static_cast
<
int32_t
const
*>
(
inputs
[
2
]),
static_cast
<
int32_t
const
*>
(
inputs
[
3
]),
nbLookupTables_
,
beta
,
gamma
,
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
1
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
2
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
3
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
mIdsVocabSize
[
2
],
mIdsVocabSize
[
3
],
output
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Only support 2,3,4 lookup_tables fused "
));
}
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Unsupported type error, expected [kHALF,kFLOAT]"
));
...
...
@@ -414,7 +486,7 @@ int32_t EmbLayerNormVarSeqlenPluginMTron::enqueue(
void
*
const
*
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
noexcept
{
int32_t
const
batchSize
=
inputDesc
[
0
].
dims
.
d
[
0
]
-
1
;
int32_t
batchSize
=
inputDesc
[
0
].
dims
.
d
[
0
]
-
1
;
// read out the maximum sequence length from the dummy input
int32_t
const
maxSeqlen
=
inputDesc
[
nbLookupTables_
].
dims
.
d
[
0
];
int32_t
S
=
384
;
...
...
@@ -427,64 +499,141 @@ int32_t EmbLayerNormVarSeqlenPluginMTron::enqueue(
}
const
float
*
beta
=
mBetaDev
.
get
();
const
float
*
gamma
=
mGammaDev
.
get
();
int32_t
**
tem_inputs_ptr_dev
;
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
tem_inputs_ptr_dev
),
sizeof
(
void
*
)
*
nbLookupTables_
);
cudaMemcpy
(
tem_inputs_ptr_dev
,
inputs
,
sizeof
(
void
*
)
*
nbLookupTables_
,
cudaMemcpyHostToDevice
);
int32_t
*
mIdsVocabSize_dev
;
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
mIdsVocabSize_dev
),
sizeof
(
int32_t
)
*
mIdsVocabSize
.
size
());
cudaMemcpy
(
mIdsVocabSize_dev
,
&
(
mIdsVocabSize
[
0
]),
sizeof
(
int32_t
)
*
mIdsVocabSize
.
size
(),
cudaMemcpyHostToDevice
);
if
(
mType
==
nvinfer1
::
DataType
::
kFLOAT
)
{
auto
output
=
static_cast
<
float
*>
(
outputs
[
0
]);
auto
skip
=
static_cast
<
float
*>
(
outputs
[
1
]);
float
**
mIdsEmbDev_float
;
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
mIdsEmbDev_float
),
sizeof
(
void
*
)
*
nbLookupTables_
);
cudaMemcpy
(
mIdsEmbDev_float
,
&
(
mIdsEmbDev
[
0
]),
sizeof
(
void
*
)
*
nbLookupTables_
,
cudaMemcpyHostToDevice
);
return
embSkipLayerNormMTron
<
float
>
(
stream
,
if
(
nbLookupTables_
==
2
)
{
return
embSkipLayerNormMTron_2
<
float
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
nbLookupTables_
,
beta
,
gamma
,
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
1
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
output
,
skip
);
}
else
if
(
nbLookupTables_
==
3
)
{
return
embSkipLayerNormMTron_3
<
float
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
tem_inputs_ptr_dev
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
static_cast
<
int32_t
const
*>
(
inputs
[
2
]),
nbLookupTables_
,
beta
,
gamma
,
mIdsEmbDev_float
,
mIdsVocabSize_dev
,
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
1
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
2
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
mIdsVocabSize
[
2
],
output
,
skip
);
}
else
if
(
nbLookupTables_
==
4
)
{
return
embSkipLayerNormMTron_4
<
float
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
static_cast
<
int32_t
const
*>
(
inputs
[
2
]),
static_cast
<
int32_t
const
*>
(
inputs
[
3
]),
nbLookupTables_
,
beta
,
gamma
,
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
1
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
2
]),
static_cast
<
float
const
*>
(
mIdsEmbPtrs
[
3
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
mIdsVocabSize
[
2
],
mIdsVocabSize
[
3
],
output
,
skip
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Only support 2,3,4 lookup_tables fused "
));
}
}
else
if
(
mType
==
nvinfer1
::
DataType
::
kHALF
)
{
auto
output
=
static_cast
<
half
*>
(
outputs
[
0
]);
auto
skip
=
static_cast
<
half
*>
(
outputs
[
1
]);
half
**
mIdsEmbDev_half
;
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
mIdsEmbDev_half
),
sizeof
(
void
*
)
*
nbLookupTables_
);
cudaMemcpy
(
mIdsEmbDev_half
,
&
(
mIdsEmbDev
[
0
]),
sizeof
(
void
*
)
*
nbLookupTables_
,
cudaMemcpyHostToDevice
);
return
embSkipLayerNormMTron
<
half
>
(
stream
,
if
(
nbLookupTables_
==
2
)
{
return
embSkipLayerNormMTron_2
<
half
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
nbLookupTables_
,
beta
,
gamma
,
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
1
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
output
,
skip
);
}
else
if
(
nbLookupTables_
==
3
)
{
return
embSkipLayerNormMTron_3
<
half
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
tem_inputs_ptr_dev
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
static_cast
<
int32_t
const
*>
(
inputs
[
2
]),
nbLookupTables_
,
beta
,
gamma
,
mIdsEmbDev_half
,
mIdsVocabSize_dev
,
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
1
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
2
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
mIdsVocabSize
[
2
],
output
,
skip
);
}
else
if
(
nbLookupTables_
==
4
)
{
return
embSkipLayerNormMTron_4
<
half
>
(
stream
,
static_cast
<
int32_t
>
(
mLd
),
batchSize
,
S
,
static_cast
<
int32_t
const
*>
(
inputs
[
0
]),
static_cast
<
int32_t
const
*>
(
inputs
[
1
]),
static_cast
<
int32_t
const
*>
(
inputs
[
2
]),
static_cast
<
int32_t
const
*>
(
inputs
[
3
]),
nbLookupTables_
,
beta
,
gamma
,
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
0
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
1
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
2
]),
static_cast
<
half
const
*>
(
mIdsEmbPtrs
[
3
]),
mIdsVocabSize
[
0
],
mIdsVocabSize
[
1
],
mIdsVocabSize
[
2
],
mIdsVocabSize
[
3
],
output
,
skip
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Only support 2,3,4 lookup_tables fused "
));
}
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Unsupported type error, expected [kHALF,kFLOAT]"
));
...
...
@@ -566,9 +715,9 @@ void EmbLayerNormVarSeqlenPluginBase::serialize(void* buffer) const noexcept {
size_t
const
wordSize
=
getElementSize
(
mType
);
serFromDev
(
&
d
,
mBetaDev
.
get
(),
mLd
);
serFromDev
(
&
d
,
mGammaDev
.
get
(),
mLd
);
for
(
size_t
i
=
0
;
i
<
mIdsEmb
Dev
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
mIdsEmb
Ptrs
.
size
();
++
i
)
{
serFromDev
(
&
d
,
static_cast
<
char
*>
(
mIdsEmb
Dev
[
i
]),
static_cast
<
char
*>
(
mIdsEmb
Ptrs
[
i
]),
mLd
*
mIdsVocabSize
[
i
]
*
wordSize
);
}
}
...
...
@@ -577,8 +726,8 @@ void EmbLayerNormVarSeqlenPluginBase::destroy() noexcept {
// This gets called when the network containing plugin is destroyed
mBetaDev
.
reset
(
nullptr
);
mGammaDev
.
reset
(
nullptr
);
for
(
size_t
i
=
0
;
i
<
mIdsEmb
Dev
.
size
();
++
i
)
{
cudaFree
(
mIdsEmb
Dev
[
i
]);
for
(
size_t
i
=
0
;
i
<
mIdsEmb
Ptrs
.
size
();
++
i
)
{
cudaFree
(
mIdsEmb
Ptrs
[
i
]);
}
delete
this
;
}
...
...
@@ -680,7 +829,6 @@ nvinfer1::IPluginV2* EmbLayerNormVarSeqlenPluginHFaceCreator::createPlugin(
beta
,
gamma
,
IdsEmb
);
return
p
;
}
...
...
paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h
浏览文件 @
6512e087
...
...
@@ -31,32 +31,121 @@ namespace tensorrt {
namespace
plugin
{
template
<
typename
T
>
int32_t
embSkipLayerNormHFace
(
cudaStream_t
stream
,
int32_t
ld
,
int32_t
B
,
int32_t
S
,
int32_t
**
inputIds
,
int32_t
const
nbLookupTables
,
float
const
*
beta
,
float
const
*
gamma
,
T
**
idsEmb
,
int32_t
*
,
T
*
output
);
int32_t
embSkipLayerNormHFace_2
(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
T
const
*
,
T
const
*
,
int32_t
,
int32_t
,
T
*
);
template
<
typename
T
>
int32_t
embSkipLayerNormMTron
(
cudaStream_t
stream
,
int32_t
ld
,
int32_t
B
,
int32_t
S
,
int32_t
**
inputIds
,
int32_t
const
nbLookupTables
,
float
const
*
beta
,
float
const
*
gamma
,
T
**
idsEmb
,
int32_t
*
,
T
*
output
,
T
*
skip
);
int32_t
embSkipLayerNormHFace_3
(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
T
const
*
,
T
const
*
,
T
const
*
,
int32_t
,
int32_t
,
int32_t
,
T
*
);
template
<
typename
T
>
int32_t
embSkipLayerNormHFace_4
(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
T
const
*
,
T
const
*
,
T
const
*
,
T
const
*
,
int32_t
,
int32_t
,
int32_t
,
int32_t
,
T
*
);
template
<
typename
T
>
int32_t
embSkipLayerNormMTron_2
(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
T
const
*
,
T
const
*
,
int32_t
,
int32_t
,
T
*
,
T
*
);
template
<
typename
T
>
int32_t
embSkipLayerNormMTron_3
(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
T
const
*
,
T
const
*
,
T
const
*
,
int32_t
,
int32_t
,
int32_t
,
T
*
,
T
*
);
template
<
typename
T
>
int32_t
embSkipLayerNormMTron_4
(
cudaStream_t
,
int32_t
,
int32_t
,
int32_t
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
const
*
,
int32_t
,
float
const
*
,
float
const
*
,
T
const
*
,
T
const
*
,
T
const
*
,
T
const
*
,
int32_t
,
int32_t
,
int32_t
,
int32_t
,
T
*
,
T
*
);
class
EmbLayerNormVarSeqlenPluginBase
:
public
nvinfer1
::
IPluginV2DynamicExt
{
public:
EmbLayerNormVarSeqlenPluginBase
(
...
...
@@ -104,7 +193,8 @@ class EmbLayerNormVarSeqlenPluginBase : public nvinfer1::IPluginV2DynamicExt {
std
::
string
mNamespace
;
cuda_unique_ptr
<
float
>
mGammaDev
;
cuda_unique_ptr
<
float
>
mBetaDev
;
std
::
vector
<
void
*>
mIdsEmbDev
;
std
::
vector
<
void
*>
mIdsEmbPtrs
;
// std::vector<void*> mIdsEmbDev;
size_t
mLd
;
// leading dim = hidden size
std
::
vector
<
int32_t
>
mIdsVocabSize
;
WeightsWithOwnership
mBeta
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录