Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
9d5003dc
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
9d5003dc
编写于
9月 14, 2022
作者:
W
wenbin
提交者:
GitHub
9月 14, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix compile (#45996) (#46027)
上级
e223cf7b
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
25 addition
and
4 deletion
+25
-4
paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu
...inference/tensorrt/plugin/layernorm_shift_partition_op.cu
+25
-4
未找到文件。
paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu
浏览文件 @
9d5003dc
...
@@ -92,8 +92,12 @@ __global__ void layernorm_shift_partition(T *out,
...
@@ -92,8 +92,12 @@ __global__ void layernorm_shift_partition(T *out,
float
mean
=
0.0
f
;
float
mean
=
0.0
f
;
float
variance
=
0.0
f
;
float
variance
=
0.0
f
;
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
float
local_out
=
float
local_out
=
(
tid
<
n
)
?
static_cast
<
float
>
(
__ldg
(
input
+
bid
*
n
+
tid
))
:
0.0
f
;
(
tid
<
n
)
?
static_cast
<
float
>
(
__ldg
(
input
+
bid
*
n
+
tid
))
:
0.0
f
;
#else
float
local_out
=
(
tid
<
n
)
?
static_cast
<
float
>
(
input
[
bid
*
n
+
tid
])
:
0.0
f
;
#endif
mean
=
blockReduceSum
<
float
>
(
local_out
);
mean
=
blockReduceSum
<
float
>
(
local_out
);
if
(
threadIdx
.
x
==
0
)
{
if
(
threadIdx
.
x
==
0
)
{
...
@@ -109,14 +113,20 @@ __global__ void layernorm_shift_partition(T *out,
...
@@ -109,14 +113,20 @@ __global__ void layernorm_shift_partition(T *out,
__syncthreads
();
__syncthreads
();
if
(
tid
<
n
)
{
if
(
tid
<
n
)
{
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
out
[
output_bid
*
n
+
tid
]
=
out
[
output_bid
*
n
+
tid
]
=
(
T
)(((
local_out
-
s_mean
)
*
rsqrtf
(
s_variance
))
*
(
T
)(((
local_out
-
s_mean
)
*
rsqrtf
(
s_variance
))
*
static_cast
<
float
>
(
__ldg
(
&
gamma
[
tid
]))
+
static_cast
<
float
>
(
__ldg
(
&
gamma
[
tid
]))
+
static_cast
<
float
>
(
__ldg
(
&
beta
[
tid
])));
static_cast
<
float
>
(
__ldg
(
&
beta
[
tid
])));
#else
out
[
output_bid
*
n
+
tid
]
=
(
T
)(((
local_out
-
s_mean
)
*
rsqrtf
(
s_variance
))
*
static_cast
<
float
>
(
gamma
[
tid
])
+
static_cast
<
float
>
(
beta
[
tid
]));
#endif
}
}
}
}
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
template
<
>
template
<
>
__global__
void
layernorm_shift_partition
(
half2
*
out_ptr
,
__global__
void
layernorm_shift_partition
(
half2
*
out_ptr
,
const
half2
*
input_ptr
,
const
half2
*
input_ptr
,
...
@@ -129,6 +139,7 @@ __global__ void layernorm_shift_partition(half2 *out_ptr,
...
@@ -129,6 +139,7 @@ __global__ void layernorm_shift_partition(half2 *out_ptr,
int
shift_size
,
int
shift_size
,
int
window_size
,
int
window_size
,
const
float
eps
)
{
const
float
eps
)
{
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const
int
batch_offset
=
blockIdx
.
z
*
gridDim
.
y
*
gridDim
.
x
;
const
int
batch_offset
=
blockIdx
.
z
*
gridDim
.
y
*
gridDim
.
x
;
const
int
bid
=
batch_offset
+
blockIdx
.
y
*
gridDim
.
x
+
blockIdx
.
x
;
const
int
bid
=
batch_offset
+
blockIdx
.
y
*
gridDim
.
x
+
blockIdx
.
x
;
const
int
shifted_H_idx
=
const
int
shifted_H_idx
=
...
@@ -185,8 +196,8 @@ __global__ void layernorm_shift_partition(half2 *out_ptr,
...
@@ -185,8 +196,8 @@ __global__ void layernorm_shift_partition(half2 *out_ptr,
(
local_out_fp2
.
y
-
s_mean
)
*
s_variance
*
gamma_val
.
y
+
beta_val
.
y
;
(
local_out_fp2
.
y
-
s_mean
)
*
s_variance
*
gamma_val
.
y
+
beta_val
.
y
;
out_ptr
[
output_bid
*
n
+
tid
]
=
__float22half2_rn
(
local_out_fp2
);
out_ptr
[
output_bid
*
n
+
tid
]
=
__float22half2_rn
(
local_out_fp2
);
}
}
}
#endif
#endif
}
#define kITE 4
#define kITE 4
template
<
typename
T
>
template
<
typename
T
>
...
@@ -233,7 +244,11 @@ __global__ void layernorm_shift_partition_v2(T *out,
...
@@ -233,7 +244,11 @@ __global__ void layernorm_shift_partition_v2(T *out,
for
(
int
i
=
0
;
i
<
kITE
;
i
++
)
{
for
(
int
i
=
0
;
i
<
kITE
;
i
++
)
{
int
col_id
=
i
*
blockDim
.
x
+
tid
;
int
col_id
=
i
*
blockDim
.
x
+
tid
;
if
(
col_id
<
n
)
{
if
(
col_id
<
n
)
{
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
local_out
[
i
]
=
static_cast
<
float
>
(
__ldg
(
input
+
offset
+
col_id
));
local_out
[
i
]
=
static_cast
<
float
>
(
__ldg
(
input
+
offset
+
col_id
));
#else
local_out
[
i
]
=
static_cast
<
float
>
(
input
[
offset
+
col_id
]);
#endif
sum
+=
local_out
[
i
];
sum
+=
local_out
[
i
];
}
}
}
}
...
@@ -265,15 +280,20 @@ __global__ void layernorm_shift_partition_v2(T *out,
...
@@ -265,15 +280,20 @@ __global__ void layernorm_shift_partition_v2(T *out,
for
(
int
i
=
0
;
i
<
kITE
;
i
++
)
{
for
(
int
i
=
0
;
i
<
kITE
;
i
++
)
{
int
col_id
=
i
*
blockDim
.
x
+
tid
;
int
col_id
=
i
*
blockDim
.
x
+
tid
;
if
(
col_id
<
n
)
{
if
(
col_id
<
n
)
{
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
out
[
output_offset
+
col_id
]
=
out
[
output_offset
+
col_id
]
=
(
T
)(
local_out
[
i
]
*
s_variance
*
(
T
)(
local_out
[
i
]
*
s_variance
*
static_cast
<
float
>
(
__ldg
(
&
gamma
[
col_id
]))
+
static_cast
<
float
>
(
__ldg
(
&
gamma
[
col_id
]))
+
static_cast
<
float
>
(
__ldg
(
&
beta
[
col_id
])));
static_cast
<
float
>
(
__ldg
(
&
beta
[
col_id
])));
#else
out
[
output_offset
+
col_id
]
=
(
T
)(
local_out
[
i
]
*
s_variance
*
static_cast
<
float
>
(
gamma
[
col_id
])
+
static_cast
<
float
>
(
beta
[
col_id
]));
#endif
}
}
}
}
}
}
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
template
<
>
template
<
>
__global__
void
layernorm_shift_partition_v2
(
half2
*
out_ptr
,
__global__
void
layernorm_shift_partition_v2
(
half2
*
out_ptr
,
const
half2
*
__restrict
input_ptr
,
const
half2
*
__restrict
input_ptr
,
...
@@ -286,6 +306,7 @@ __global__ void layernorm_shift_partition_v2(half2 *out_ptr,
...
@@ -286,6 +306,7 @@ __global__ void layernorm_shift_partition_v2(half2 *out_ptr,
int
shift_size
,
int
shift_size
,
int
window_size
,
int
window_size
,
const
float
eps
)
{
const
float
eps
)
{
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
// constexpr int ite = 4;
// constexpr int ite = 4;
const
int
tid
=
threadIdx
.
x
;
const
int
tid
=
threadIdx
.
x
;
const
int
batch_offset
=
blockIdx
.
z
*
gridDim
.
y
*
gridDim
.
x
;
const
int
batch_offset
=
blockIdx
.
z
*
gridDim
.
y
*
gridDim
.
x
;
...
@@ -359,8 +380,8 @@ __global__ void layernorm_shift_partition_v2(half2 *out_ptr,
...
@@ -359,8 +380,8 @@ __global__ void layernorm_shift_partition_v2(half2 *out_ptr,
__ldg
(
&
beta_ptr
[
col_id
]);
__ldg
(
&
beta_ptr
[
col_id
]);
}
}
}
}
}
#endif
#endif
}
template
<
typename
T
>
template
<
typename
T
>
void
invokeLayernormShiftPartition
(
T
*
out
,
void
invokeLayernormShiftPartition
(
T
*
out
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录