Repository: 机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit 809a10b6 (unverified), authored by Feiyu Chan on Jan 27, 2022 and committed via GitHub on Jan 27, 2022
Parent: 3e6950d5

move math_cuda_utils.h to pten/kernels/funcs (#39246)

Showing 6 changed files with 124 additions and 95 deletions (+124, -95)
Changed files:
  paddle/fluid/operators/activation_op.cu                          +0    -1
  paddle/fluid/operators/interpolate_v2_op.cu                      +10   -7
  paddle/fluid/operators/math/bert_encoder_functor.cu              +101  -72
  paddle/fluid/operators/optimizers/lars_momentum_op.cu            +9    -8
  paddle/fluid/operators/softmax_cudnn_op.cu.h                     +0    -1
  paddle/pten/kernels/funcs/math_cuda_utils.h                      +4    -6
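Every hunk below applies the same mechanical change: the include of paddle/fluid/operators/math/math_cuda_utils.h becomes paddle/pten/kernels/funcs/math_cuda_utils.h, and the reduction helpers and the kvp pair type that were reached through math:: (paddle::operators::math) are re-qualified as pten::funcs::. As a minimal sketch of a call site after the move (a hypothetical kernel, not part of this commit; the blockReduceSum<T>(val, mask) signature is taken from the hunks below, would need to be compiled inside the Paddle tree at this commit, and the full-warp mask constant merely stands in for FINAL_MASK):

#include "paddle/pten/kernels/funcs/math_cuda_utils.h"

// Hypothetical kernel, for illustration only: one block sums `n` floats.
// Before this commit the reduction would have been spelled
// math::blockReduceSum<float>(partial, FINAL_MASK).
__global__ void BlockSumExample(const float *in, float *out, int n) {
  const unsigned kFullWarpMask = 0xFFFFFFFFu;  // plays the role of FINAL_MASK
  float partial = 0.f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    partial += in[i];
  }
  // Same helper as before, now under the pten::funcs namespace.
  partial = pten::funcs::blockReduceSum<float>(partial, kFullWarpMask);
  if (threadIdx.x == 0) {
    *out = partial;
  }
}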
paddle/fluid/operators/activation_op.cu
@@ -12,7 +12,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
paddle/fluid/operators/interpolate_v2_op.cu
@@ -12,11 +12,11 @@
 #include <algorithm>
 #include <string>
 #include "paddle/fluid/operators/interpolate_v2_op.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/fast_divmod.h"
+#include "paddle/pten/kernels/funcs/math_cuda_utils.h"

 namespace paddle {
 namespace operators {
@@ -522,7 +522,7 @@ __inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block,
   if (threadIdx.x < threshold) {
     shared_last_idx = (threshold >> 5) - 1;
-    val = math::warpReduceMin(val, mask);
+    val = pten::funcs::warpReduceMin(val, mask);
     if (lane == 0) {
       shared[wid] = val;
     }
@@ -537,7 +537,7 @@ __inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block,
   if (threadIdx.x < threshold) {
     val = (lane <= shared_last_idx) ? shared[lane]
                                     : std::numeric_limits<T>::max();
-    val = math::warpReduceMin(val, mask);
+    val = pten::funcs::warpReduceMin(val, mask);
     shared_last_val = val;
   }
   __syncthreads();
@@ -589,12 +589,15 @@ __global__ void KeBilinearInterpBwShareMemory(
     s_data[0][threadIdx.x] = 0.f;
     s_data[1][threadIdx.x] = 0.f;
     int remain = nthreads - (tid & (-blockDim.x));
-    int in_top_max_index = math::blockReduceMax(top_right_index, FINAL_MASK);
-    int in_bot_max_index = math::blockReduceMax(bot_right_index, FINAL_MASK);
+    int in_top_max_index =
+        pten::funcs::blockReduceMax(top_right_index, FINAL_MASK);
+    int in_bot_max_index =
+        pten::funcs::blockReduceMax(bot_right_index, FINAL_MASK);
     if (remain > blockDim.x) {
-      in_top_min_index = math::blockReduceMin(input_index, FINAL_MASK);
-      in_bot_min_index = math::blockReduceMin(bot_left_index, FINAL_MASK);
+      in_top_min_index = pten::funcs::blockReduceMin(input_index, FINAL_MASK);
+      in_bot_min_index = pten::funcs::blockReduceMin(bot_left_index, FINAL_MASK);
     } else {
       in_top_min_index = PartialBlockMin(input_index, remain, FINAL_MASK);
       in_bot_min_index = PartialBlockMin(bot_left_index, remain, FINAL_MASK);
paddle/fluid/operators/math/bert_encoder_functor.cu
@@ -18,13 +18,17 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/kernels/funcs/math_cuda_utils.h"

 namespace paddle {
 namespace operators {
 namespace math {

+// NOTE(chenfeiyu): explicitly use operator+ for float2
+// since float2 is not in namespace pten::funcs, ADL won't help
+using pten::funcs::operator+;
+
 template <typename T>
 __device__ __forceinline__ T local_rsqrt(T num) {
   return rsqrt(static_cast<float>(num));
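The NOTE in this hunk is the one non-mechanical part of the file: float2 is a CUDA built-in type living in the global namespace, so argument-dependent lookup for an expression like a + b only searches the global namespace and never finds an operator+ overload that now sits in pten::funcs; the using pten::funcs::operator+; declaration pulls that overload into scope so the existing float2 arithmetic keeps compiling. A standalone host-side sketch of the same lookup rule (toy types and namespaces, not Paddle code):

#include <cstdio>

struct vec2 { float x, y; };  // stands in for the global-namespace float2

namespace lib {               // stands in for pten::funcs
vec2 operator+(const vec2 &a, const vec2 &b) { return {a.x + b.x, a.y + b.y}; }
}  // namespace lib

// Without this using-declaration the expression `a + b` below fails to
// compile: ADL only searches the namespaces associated with vec2 (here the
// global namespace), so lib::operator+ is never considered.
using lib::operator+;

int main() {
  vec2 a{1.f, 2.f}, b{3.f, 4.f};
  vec2 c = a + b;
  std::printf("%g %g\n", c.x, c.y);
  return 0;
}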
@@ -34,11 +38,12 @@ __device__ __forceinline__ half local_rsqrt(half num) { return hrsqrt(num); }
 #endif

 template <typename T, int TPB>
-__device__ inline void LayerNormSmall(T val, const kvp<T> &thread_data,
+__device__ inline void LayerNormSmall(T val,
+                                      const pten::funcs::kvp<T> &thread_data,
                                       const int ld, const int idx,
                                       const float *bias, const float *scale,
                                       T *output, T eps) {
-  using BlockReduce = cub::BlockReduce<kvp<T>, TPB>;
+  using BlockReduce = cub::BlockReduce<pten::funcs::kvp<T>, TPB>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   __shared__ T mu;      // mean
   __shared__ T rsigma;  // 1 / std.dev.
@@ -59,10 +64,11 @@ __device__ inline void LayerNormSmall(T val, const kvp<T> &thread_data,
 }

 template <typename T, int TPB>
-__device__ inline void LayerNorm(const kvp<T> &thread_data, const int ld,
-                                 const int offset, const float *bias,
-                                 const float *scale, T *output, T eps) {
-  using BlockReduce = cub::BlockReduce<kvp<T>, TPB>;
+__device__ inline void LayerNorm(const pten::funcs::kvp<T> &thread_data,
+                                 const int ld, const int offset,
+                                 const float *bias, const float *scale,
+                                 T *output, T eps) {
+  using BlockReduce = cub::BlockReduce<pten::funcs::kvp<T>, TPB>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   __shared__ T mu;      // mean
   __shared__ T rsigma;  // 1 / std.dev.
@@ -85,10 +91,11 @@ __device__ inline void LayerNorm(const kvp<T> &thread_data, const int ld,
 }

 template <typename T, typename T2, int TPB>
-__device__ inline void LayerNorm2(const kvp<T> &thread_data, const int ld,
-                                  const int offset, const float2 *bias,
-                                  const float2 *scale, T2 *output, T eps) {
-  using BlockReduce = cub::BlockReduce<kvp<T>, TPB>;
+__device__ inline void LayerNorm2(const pten::funcs::kvp<T> &thread_data,
+                                  const int ld, const int offset,
+                                  const float2 *bias, const float2 *scale,
+                                  T2 *output, T eps) {
+  using BlockReduce = cub::BlockReduce<pten::funcs::kvp<T>, TPB>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   __shared__ T mu;      // mean
   __shared__ T rsigma;  // 1 / std.dev.
@@ -137,7 +144,7 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids,
     const int64_t out_offset = seq_pos * hidden;
-    kvp<T> thread_data(0, 0);
+    pten::funcs::kvp<T> thread_data(0, 0);
 #pragma unroll
     for (int it = threadIdx.x; it < hidden; it += TPB) {
@@ -148,7 +155,8 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids,
       output[out_offset + it] = val;
       const T rhiddenval = rhidden * val;
-      thread_data = pair_sum(thread_data, kvp<T>(rhiddenval, rhiddenval * val));
+      thread_data = pair_sum(
+          thread_data, pten::funcs::kvp<T>(rhiddenval, rhiddenval * val));
     }
     LayerNorm<T, TPB>(thread_data, hidden, out_offset, bias, scale, output, eps);
   }
@@ -180,7 +188,7 @@ __global__ void EmbEltwiseLayernormKernel<half, 256>(
     const int64_t out_offset = seq_pos * hidden;
-    kvp<half> thread_data(0, 0);
+    pten::funcs::kvp<half> thread_data(0, 0);
 #pragma unroll
     for (int it = threadIdx.x; it < hidden; it += 256) {
@@ -191,8 +199,8 @@ __global__ void EmbEltwiseLayernormKernel<half, 256>(
       output[out_offset + it] = val;
       const half rhiddenval = rhidden * val;
-      thread_data =
-          pair_sum(thread_data, kvp<half>(rhiddenval, rhiddenval * val));
+      thread_data = pair_sum(
+          thread_data, pten::funcs::kvp<half>(rhiddenval, rhiddenval * val));
     }
     LayerNorm<half, 256>(thread_data, hidden, out_offset, bias, scale, output,
                          eps);
@@ -233,10 +241,10 @@ __global__ void SoftmaxKernelWithEltadd(T *qk_buf_, const T *bias_qk_,
                   ? static_cast<float>(qk_buf_[threadIdx.x + qk_offset] +
                                        bias_qk_[threadIdx.x + qk_offset])
                   : -1e20f;
-  float max_val = blockReduceMax<float>(tmp, mask);
+  float max_val = pten::funcs::blockReduceMax<float>(tmp, mask);

   float qk_tmp = threadIdx.x < seq_len ? __expf(tmp - max_val) : 0.0f;
-  float sum_val = blockReduceSum<float>(qk_tmp, mask);
+  float sum_val = pten::funcs::blockReduceSum<float>(qk_tmp, mask);

   if (threadIdx.x < seq_len)
     qk_buf_[threadIdx.x + qk_offset] = (T)(qk_tmp / sum_val);
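SoftmaxKernelWithEltadd above, and the variants that follow, all implement the same numerically stable softmax: reduce for the row maximum, exponentiate the shifted values, reduce for the sum, then divide. Subtracting the maximum changes nothing mathematically, since exp(x_i - m) / sum_j exp(x_j - m) = exp(x_i) / sum_j exp(x_j), but it keeps __expf from overflowing for large logits. A host-side sketch of the same three steps (plain C++ loops standing in for the blockReduceMax / blockReduceSum calls, not Paddle code):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Numerically stable softmax over one row, mirroring the
// max -> exp(x - max) -> sum -> divide sequence in the kernels above.
std::vector<float> StableSoftmax(const std::vector<float> &x) {
  const float max_val = *std::max_element(x.begin(), x.end());  // blockReduceMax step
  std::vector<float> out(x.size());
  float sum_val = 0.f;
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = std::exp(x[i] - max_val);  // __expf(tmp - max_val)
    sum_val += out[i];                  // blockReduceSum step
  }
  for (float &v : out) v /= sum_val;    // qk_tmp / sum_val
  return out;
}

int main() {
  // Without the max shift, exp(1002.f) would overflow to inf in float.
  for (float v : StableSoftmax({1000.f, 1001.f, 1002.f})) std::printf("%.4f ", v);
  std::printf("\n");
  return 0;
}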
@@ -256,10 +264,10 @@ __global__ void SoftmaxKernelWithEltadd<half>(
                   ? static_cast<float>(qk_buf_[threadIdx.x + qk_offset] +
                                        bias_qk_[threadIdx.x + qk_offset])
                   : -1e20f;
-  float max_val = blockReduceMax<float>(tmp, mask);
+  float max_val = pten::funcs::blockReduceMax<float>(tmp, mask);

   float qk_tmp = threadIdx.x < seq_len ? __expf(tmp - max_val) : 0.0f;
-  float sum_val = blockReduceSum<float>(qk_tmp, mask);
+  float sum_val = pten::funcs::blockReduceSum<float>(qk_tmp, mask);

   if (threadIdx.x < seq_len)
     qk_buf_[threadIdx.x + qk_offset] = (half)(qk_tmp / sum_val);
@@ -276,19 +284,20 @@ __global__ void SoftmaxKernelWithEltadd2(T *qk_buf_, const T *bias_qk_,
   int idx = threadIdx.x;
   assert(blockDim.x % 32 == 0);

-  float2 tmp = idx < seq_len ? ToFloat2<T>(qk_buf_[idx + qk_offset] +
-                                           bias_qk_[idx + qk_offset])
-                             : make_float2(-1e20f, -1e20f);
-  float max_val = blockReduceMax<float>(max(tmp.x, tmp.y), mask);
+  float2 tmp = idx < seq_len
+                   ? pten::funcs::ToFloat2<T>(qk_buf_[idx + qk_offset] +
+                                              bias_qk_[idx + qk_offset])
+                   : make_float2(-1e20f, -1e20f);
+  float max_val = pten::funcs::blockReduceMax<float>(max(tmp.x, tmp.y), mask);
   float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val),
                                               __expf(tmp.y - max_val))
                                 : make_float2(0.f, 0.f);
-  float sum_val = blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;
+  float sum_val =
+      pten::funcs::blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;

   if (idx < seq_len) {
     qk_buf_[idx + qk_offset] =
-        FloatsToPair<T>(qk_tmp.x / sum_val, qk_tmp.y / sum_val);
+        pten::funcs::FloatsToPair<T>(qk_tmp.x / sum_val, qk_tmp.y / sum_val);
   }
 }
@@ -304,18 +313,20 @@ __global__ void SoftmaxKernelWithEltadd2<half2>(
   int idx = threadIdx.x;
   assert(blockDim.x % 32 == 0);

-  float2 tmp = idx < seq_len ? ToFloat2<half2>(qk_buf_[idx + qk_offset] +
-                                               bias_qk_[idx + qk_offset])
-                             : make_float2(-1e20f, -1e20f);
-  float max_val = blockReduceMax<float>(max(tmp.x, tmp.y), mask);
+  float2 tmp = idx < seq_len
+                   ? pten::funcs::ToFloat2<half2>(qk_buf_[idx + qk_offset] +
+                                                  bias_qk_[idx + qk_offset])
+                   : make_float2(-1e20f, -1e20f);
+  float max_val = pten::funcs::blockReduceMax<float>(max(tmp.x, tmp.y), mask);
   float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val),
                                               __expf(tmp.y - max_val))
                                 : make_float2(0.f, 0.f);
-  float sum_val = blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;
+  float sum_val =
+      pten::funcs::blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;

   if (idx < seq_len) {
-    qk_buf_[idx + qk_offset] =
-        FloatsToPair<half2>(qk_tmp.x / sum_val, qk_tmp.y / sum_val);
+    qk_buf_[idx + qk_offset] = pten::funcs::FloatsToPair<half2>(
+        qk_tmp.x / sum_val, qk_tmp.y / sum_val);
   }
 #endif
 }
@@ -338,14 +349,14 @@ __global__ void SoftmaxKernelWithEltaddForLarge(T *qk_buf, const T *bias_qk,
                          bias_qk[threadIdx.x + i + qk_offset]
                    : stride_max;
   }
-  T max_val = blockReduceMax<T>(stride_max, mask);
+  T max_val = pten::funcs::blockReduceMax<T>(stride_max, mask);

   T stride_sum = 0.f;
   for (int i = 0; i < seq_len; i += blockDim.x) {
     stride_sum += __expf(qk_buf[threadIdx.x + i + qk_offset] +
                          bias_qk[threadIdx.x + i + qk_offset] - max_val);
   }
-  T sum_val = blockReduceSum<T>(stride_sum, mask);
+  T sum_val = pten::funcs::blockReduceSum<T>(stride_sum, mask);

   for (int i = 0; i < seq_len; i += blockDim.x) {
     qk_buf[threadIdx.x + i + qk_offset] =
@@ -371,7 +382,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge(
                                   bias_qk[threadIdx.x + i + qk_offset]);
     stride_max = tmp > stride_max ? tmp : stride_max;
   }
-  float max_val = blockReduceMax<float>(stride_max, mask);
+  float max_val = pten::funcs::blockReduceMax<float>(stride_max, mask);

   float stride_sum = 0.f;
   for (int i = 0; i < seq_len; i += blockDim.x) {
@@ -379,7 +390,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge(
                                   bias_qk[threadIdx.x + i + qk_offset]);
     stride_sum += __expf(tmp - max_val);
   }
-  float sum_val = blockReduceSum<float>(stride_sum, mask);
+  float sum_val = pten::funcs::blockReduceSum<float>(stride_sum, mask);

   for (int i = 0; i < seq_len; i += blockDim.x) {
     float tmp =
@@ -403,28 +414,33 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(T *qk_buf_, const T *bias_qk_,
   float2 stride_max = make_float2(-1e20f, -1e20f);
   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
-                             bias_qk_[threadIdx.x + i + qk_offset]);
+    float2 cur =
+        pten::funcs::ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                 bias_qk_[threadIdx.x + i + qk_offset]);
     stride_max.x = max(stride_max.x, cur.x);
     stride_max.y = max(stride_max.y, cur.y);
   }
-  float max_val = blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);
+  float max_val =
+      pten::funcs::blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);

   float2 stride_sum = make_float2(0.f, 0.f);
   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
-                             bias_qk_[threadIdx.x + i + qk_offset]);
+    float2 cur =
+        pten::funcs::ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                 bias_qk_[threadIdx.x + i + qk_offset]);
     stride_sum.x += __expf(cur.x - max_val);
     stride_sum.y += __expf(cur.y - max_val);
   }

   float sum_val =
-      blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) + 1e-6f;
+      pten::funcs::blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) +
+      1e-6f;

   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
-                             bias_qk_[threadIdx.x + i + qk_offset]);
-    qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair<T>(
+    float2 cur =
+        pten::funcs::ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                 bias_qk_[threadIdx.x + i + qk_offset]);
+    qk_buf_[threadIdx.x + i + qk_offset] = pten::funcs::FloatsToPair<T>(
         __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val);
   }
 }
@@ -443,28 +459,33 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(
   float2 stride_max = make_float2(-1e20f, -1e20f);
   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
-                                 bias_qk_[threadIdx.x + i + qk_offset]);
+    float2 cur =
+        pten::funcs::ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                     bias_qk_[threadIdx.x + i + qk_offset]);
     stride_max.x = max(stride_max.x, cur.x);
     stride_max.y = max(stride_max.y, cur.y);
   }
-  float max_val = blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);
+  float max_val =
+      pten::funcs::blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);

   float2 stride_sum = make_float2(0.f, 0.f);
   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
-                                 bias_qk_[threadIdx.x + i + qk_offset]);
+    float2 cur =
+        pten::funcs::ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                     bias_qk_[threadIdx.x + i + qk_offset]);
     stride_sum.x += __expf(cur.x - max_val);
     stride_sum.y += __expf(cur.y - max_val);
   }

   float sum_val =
-      blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) + 1e-6f;
+      pten::funcs::blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) +
+      1e-6f;

   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
-                                 bias_qk_[threadIdx.x + i + qk_offset]);
-    qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair<half2>(
+    float2 cur =
+        pten::funcs::ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                     bias_qk_[threadIdx.x + i + qk_offset]);
+    qk_buf_[threadIdx.x + i + qk_offset] = pten::funcs::FloatsToPair<half2>(
         __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val);
   }
 #endif
@@ -595,13 +616,14 @@ __global__ void SkipLayerNormSmallKernel(int num, int hidden, const T *input1,
   const T rld = T(1) / T(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<T> thread_data(0, 0);
+  pten::funcs::kvp<T> thread_data(0, 0);
   const int idx = offset + threadIdx.x;
   T val = 0;
   if (threadIdx.x < hidden) {
     val = input1[idx] + input2[idx];
     const T rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<T>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<T>(rldval, rldval * val));
   }
   LayerNormSmall<T, TPB>(val, thread_data, hidden, idx, bias, scale, output, eps);
@@ -617,13 +639,14 @@ __global__ void SkipLayerNormSmallKernel<half, 32>(
   const half rld = half(1) / half(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);
   const int idx = offset + threadIdx.x;
   half val = 0;
   if (threadIdx.x < hidden) {
     val = input1[idx] + input2[idx];
     const half rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
   }
   LayerNormSmall<half, 32>(val, thread_data, hidden, idx, bias, scale, output, eps);
@@ -638,13 +661,14 @@ __global__ void SkipLayerNormSmallKernel<half, 128>(
   const half rld = half(1) / half(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);
   const int idx = offset + threadIdx.x;
   half val = 0;
   if (threadIdx.x < hidden) {
     val = input1[idx] + input2[idx];
     const half rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
   }
   LayerNormSmall<half, 128>(val, thread_data, hidden, idx, bias, scale, output, eps);
@@ -659,13 +683,14 @@ __global__ void SkipLayerNormSmallKernel<half, 384>(
   const half rld = half(1) / half(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);
   const int idx = offset + threadIdx.x;
   half val = 0;
   if (threadIdx.x < hidden) {
     val = input1[idx] + input2[idx];
     const half rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
   }
   LayerNormSmall<half, 384>(val, thread_data, hidden, idx, bias, scale, output, eps);
@@ -681,13 +706,14 @@ __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1,
   const T rld = T(1) / T(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<T> thread_data(0, 0);
+  pten::funcs::kvp<T> thread_data(0, 0);

   for (int it = threadIdx.x; it < hidden; it += TPB) {
     const int idx = offset + it;
     const T val = input1[idx] + input2[idx];
     const T rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<T>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<T>(rldval, rldval * val));
     output[idx] = val;
   }
   LayerNorm<T, TPB>(thread_data, hidden, offset, bias, scale, output, eps);
@@ -705,13 +731,14 @@ __global__ void SkipLayerNormKernel<half, 256>(int num, int hidden,
   const half rld = half(1) / half(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);

   for (int it = threadIdx.x; it < hidden; it += 256) {
     const int idx = offset + it;
     const half val = input1[idx] + input2[idx];
     const half rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
     output[idx] = val;
   }
   LayerNorm<half, 256>(thread_data, hidden, offset, bias, scale, output, eps);
@@ -727,13 +754,14 @@ __global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1,
   const T rld = T(0.5f / hidden);  // because hidden is hidden/2
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<T> thread_data(0, 0);
+  pten::funcs::kvp<T> thread_data(0, 0);

   for (int it = threadIdx.x; it < hidden; it += TPB) {
     const int idx = offset + it;
     const T2 val2 = input1[idx] + input2[idx];
-    thread_data = pair_sum(thread_data, kvp<T>(rld * (val2.x + val2.y),
+    thread_data = pair_sum(
+        thread_data, pten::funcs::kvp<T>(rld * (val2.x + val2.y),
                                          rld * val2.x * val2.x +
                                              rld * val2.y * val2.y));
     output[idx] = val2;
   }
@@ -751,13 +779,14 @@ __global__ void SkipLayerNormKernel2<half, half2, 256>(
   const half rld = half(0.5f / hidden);  // because hidden is hidden/2
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);

   for (int it = threadIdx.x; it < hidden; it += 256) {
     const int idx = offset + it;
     const half2 val2 = input1[idx] + input2[idx];
-    thread_data = pair_sum(thread_data, kvp<half>(rld * (val2.x + val2.y),
+    thread_data = pair_sum(
+        thread_data, pten::funcs::kvp<half>(rld * (val2.x + val2.y),
                                             rld * val2.x * val2.x +
                                                 rld * val2.y * val2.y));
     output[idx] = val2;
   }
paddle/fluid/operators/optimizers/lars_momentum_op.cu
@@ -14,9 +14,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
 #include "paddle/fluid/platform/fast_divmod.h"
+#include "paddle/pten/kernels/funcs/math_cuda_utils.h"

 #if CUDA_VERSION >= 11000
 #include <cooperative_groups.h>
@@ -170,8 +170,8 @@ __global__ void L2NormKernel(
     g_tmp += (tmp1 * tmp1);
     tid += grid_stride;
   }
-  p_tmp = math::blockReduceSum<MT>(p_tmp, FINAL_MASK);
-  g_tmp = math::blockReduceSum<MT>(g_tmp, FINAL_MASK);
+  p_tmp = pten::funcs::blockReduceSum<MT>(p_tmp, FINAL_MASK);
+  g_tmp = pten::funcs::blockReduceSum<MT>(g_tmp, FINAL_MASK);
   if (threadIdx.x == 0) {
     p_buffer[blockIdx.x] = p_tmp;
@@ -181,8 +181,8 @@ __global__ void L2NormKernel(
   cg->sync();  // Grid sync for writring partial result to gloabl memory
   MT p_part_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0;
   MT g_part_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0;
-  MT tmp0 = math::blockReduceSum<MT>(p_part_sum, FINAL_MASK);
-  MT tmp1 = math::blockReduceSum<MT>(g_part_sum, FINAL_MASK);
+  MT tmp0 = pten::funcs::blockReduceSum<MT>(p_part_sum, FINAL_MASK);
+  MT tmp1 = pten::funcs::blockReduceSum<MT>(g_part_sum, FINAL_MASK);
   if (threadIdx.x == 0) {
     s_buffer[0] = tmp0;
     s_buffer[1] = tmp1;
@@ -294,9 +294,10 @@ __global__ void MomentumLarsKernel(
   MT param_part_norm = threadIdx.x < thresh ? p_buffer[threadIdx.x] : 0;
   MT grad_part_norm = threadIdx.x < thresh ? g_buffer[threadIdx.x] : 0;
   __syncthreads();
-  MT param_norm = Sqrt(math::blockReduceSum<MT>(param_part_norm, FINAL_MASK));
-  MT grad_norm = Sqrt(rescale_grad_pow *
-                      math::blockReduceSum<MT>(grad_part_norm, FINAL_MASK));
+  MT param_norm =
+      Sqrt(pten::funcs::blockReduceSum<MT>(param_part_norm, FINAL_MASK));
+  MT grad_norm = Sqrt(rescale_grad_pow * pten::funcs::blockReduceSum<MT>(
+                                             grad_part_norm, FINAL_MASK));
 #endif
   MomentumUpdate<T, MT>(param, grad, velocity, param_out, velocity_out,
                         master_param, master_param_out, learning_rate, mu,
paddle/fluid/operators/softmax_cudnn_op.cu.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/operators/softmax_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
paddle/fluid/operators/math/math_cuda_utils.h → paddle/pten/kernels/funcs/math_cuda_utils.h
@@ -23,9 +23,8 @@ limitations under the License. */
 #include <algorithm>

-namespace paddle {
-namespace operators {
-namespace math {
+namespace pten {
+namespace funcs {

 template <typename T>
 __device__ __forceinline__ T FromFloat(float a);
@@ -315,6 +314,5 @@ __inline__ __device__ T PartialBlockReduceMin(T val, unsigned mask) {
   return val;
 }

-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten