PaddlePaddle / Paddle

Commit 38edea9a (unverified)
Authored Sep 09, 2022 by sneaxiy; committed by GitHub on Sep 09, 2022
Fix softmax op when the input shape is larger than INT32_MAX (#45897)
* fix softmax int64
* follow comments
Parent: bd8f998b

Showing 2 changed files with 100 additions and 76 deletions (+100 −76):

    paddle/phi/kernels/funcs/axis_utils.h        +7   −4
    paddle/phi/kernels/gpudnn/softmax_gpudnn.h   +93  −72
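Why the guard is needed: once a softmax input holds more than INT32_MAX elements, any offset or element count computed in a 32-bit int wraps around. A minimal standalone sketch of that failure mode (the shape below is made up for illustration, not taken from the patch):

    #include <cstdint>
    #include <iostream>
    #include <limits>

    int main() {
      // Hypothetical softmax input of shape [2150000000, 1]: its element
      // count no longer fits in int32_t (max 2147483647).
      const int64_t rows = 2150000000LL;
      const int64_t cols = 1;

      const int32_t count32 = static_cast<int32_t>(rows * cols);  // wraps to a negative value
      const int64_t count64 = rows * cols;                        // correct element count

      std::cout << "int32 count: " << count32 << "\n";
      std::cout << "int64 count: " << count64 << "\n";
      std::cout << "INT32_MAX:   " << std::numeric_limits<int32_t>::max() << "\n";
      return 0;
    }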
paddle/phi/kernels/funcs/axis_utils.h

@@ -26,24 +26,27 @@ static inline int CanonicalAxis(const int axis, const int rank) {
   return axis;
 }
 
-static inline int SizeToAxis(const int axis, DDim dims) {
-  int size = 1;
+template <typename T = int>
+static inline T SizeToAxis(const int axis, DDim dims) {
+  T size = 1;
   for (int i = 0; i < axis; i++) {
     size *= dims[i];
   }
   return size;
 }
 
+template <typename T = int>
 static inline int SizeFromAxis(const int axis, DDim dims) {
-  int size = 1;
+  T size = 1;
   for (int i = axis; i < dims.size(); i++) {
     size *= dims[i];
   }
   return size;
 }
 
+template <typename T = int>
 static inline int SizeOutAxis(const int axis, DDim dims) {
-  int size = 1;
+  T size = 1;
   for (int i = axis + 1; i < dims.size(); i++) {
     size *= dims[i];
   }
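The helpers keep int as the default template argument, so existing call sites compile unchanged while overflow-sensitive callers opt into a wider accumulator. A standalone sketch of the same pattern (FakeDims is a hypothetical stand-in for DDim, not Paddle's type):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Hypothetical stand-in for phi::DDim, just enough for the example.
    using FakeDims = std::vector<int64_t>;

    // Mirrors the patched SizeToAxis: the accumulator type is a template
    // parameter that defaults to int, so old callers compile unchanged.
    template <typename T = int>
    T SizeToAxis(int axis, const FakeDims& dims) {
      T size = 1;
      for (int i = 0; i < axis; i++) {
        size *= static_cast<T>(dims[i]);
      }
      return size;
    }

    int main() {
      FakeDims dims = {2150000000LL, 8};  // more than INT32_MAX elements along axis 0
      std::cout << SizeToAxis(1, dims) << "\n";           // default int: typically wraps negative
      std::cout << SizeToAxis<int64_t>(1, dims) << "\n";  // int64_t: correct value
      return 0;
    }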
paddle/phi/kernels/gpudnn/softmax_gpudnn.h

@@ -258,30 +258,33 @@ api to compute max (sum) in one warp.
 template <typename T,
           typename VecT,
           typename AccT,
+          typename IndexType,
           int Log2Elements,
           bool LogMode = false>
 __global__ void WarpSoftmaxForward(T* softmax,
                                    const T* src,
-                                   const int batch_size,
-                                   const int stride,
-                                   const int element_count) {
-  constexpr int kDimCeil = 1 << Log2Elements;
-  constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-  constexpr int kVSize = sizeof(VecT) / sizeof(T);
-  constexpr int kLoops = kDimCeil / kWarpSize;
-  constexpr int kLoopsV = (kLoops >= kVSize) ? (kLoops / kVSize) : 1;
-  constexpr int kBatchSize = (kDimCeil <= 32) ? 2 : 1;
-  int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize;
-  constexpr int kStep = kBatchSize * kLoopsV * kVSize;
-  constexpr int kVItem = kLoopsV * kVSize;
+                                   const IndexType batch_size,
+                                   const IndexType stride,
+                                   const IndexType element_count) {
+  constexpr IndexType kDimCeil = 1 << Log2Elements;
+  constexpr IndexType kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
+  constexpr IndexType kVSize = sizeof(VecT) / sizeof(T);
+  constexpr IndexType kLoops = kDimCeil / kWarpSize;
+  constexpr IndexType kLoopsV = (kLoops >= kVSize) ? (kLoops / kVSize) : 1;
+  constexpr IndexType kBatchSize = (kDimCeil <= 32) ? 2 : 1;
+  IndexType first_batch =
+      (static_cast<IndexType>(blockDim.y) * blockIdx.x + threadIdx.y) *
+      kBatchSize;
+  constexpr IndexType kStep = kBatchSize * kLoopsV * kVSize;
+  constexpr IndexType kVItem = kLoopsV * kVSize;
   constexpr AccT kLowInf = -std::numeric_limits<AccT>::infinity();
   using kMode = kps::details::ReduceMode;
 
   // max index to read
-  int idx_max_v[kBatchSize];
+  IndexType idx_max_v[kBatchSize];
 #pragma unroll
-  for (int i = 0; i < kBatchSize; i++) {
-    int idx_max = ((i + first_batch) < batch_size) ? element_count : 0;
+  for (IndexType i = 0; i < kBatchSize; i++) {
+    IndexType idx_max = ((i + first_batch) < batch_size) ? element_count : 0;
     idx_max_v[i] = idx_max / kVSize;
   }
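One detail worth noting in the hunk above: blockDim.y, blockIdx.x, and threadIdx.y are 32-bit values, so without the added static_cast<IndexType>(blockDim.y) the product would still be computed in 32-bit arithmetic and could wrap before it is stored in the wider first_batch. A host-side sketch of that effect (the launch coordinates are made up):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Hypothetical launch coordinates large enough to overflow 32-bit math.
      uint32_t block_dim_y  = 8;
      uint32_t block_idx_x  = 600000000;  // made-up block index
      uint32_t thread_idx_y = 3;

      // All-32-bit arithmetic: the product wraps modulo 2^32 before widening.
      int64_t wrong = block_dim_y * block_idx_x + thread_idx_y;

      // Widen one operand first, as the patched kernel does with
      // static_cast<IndexType>(blockDim.y): the whole expression is 64-bit.
      int64_t right =
          static_cast<int64_t>(block_dim_y) * block_idx_x + thread_idx_y;

      std::cout << wrong << " vs " << right << "\n";  // 505032707 vs 4800000003
      return 0;
    }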
@@ -307,7 +310,7 @@ __global__ void WarpSoftmaxForward(T* softmax,
 
   // read data from global memory
 #pragma unroll
-  for (int i = 0; i < kBatchSize; ++i) {
+  for (IndexType i = 0; i < kBatchSize; ++i) {
     const VecT* src_v =
         reinterpret_cast<const VecT*>(&src[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&src_data[i][0][0]);

@@ -328,7 +331,7 @@ __global__ void WarpSoftmaxForward(T* softmax,
 
   // compute sum
 #pragma unroll
-  for (int i = 0; i < kBatchSize; ++i) {
+  for (IndexType i = 0; i < kBatchSize; ++i) {
     kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnarySubFunctor<AccT>>(
         &sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor<AccT>(max[i]));
     kps::ElementwiseUnary<AccT, AccT, kVItem, 1, ExpFunctor<AccT>>(

@@ -344,7 +347,7 @@ __global__ void WarpSoftmaxForward(T* softmax,
 
   // write data to global memory
 #pragma unroll
-  for (int i = 0; i < kBatchSize; ++i) {
+  for (IndexType i = 0; i < kBatchSize; ++i) {
     VecT* softmax_v =
         reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
@@ -489,26 +492,26 @@ __global__ void WarpSoftmaxBackward(T* dst,
   }
 }
 
-#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, AccT)        \
-  case Log2Elements:                                         \
-    WarpSoftmaxForward<T, VecT, AccT, Log2Elements, LogMode> \
-        <<<blocks, threads, 0, dev_ctx.stream()>>>(          \
-            dst, src, batch_size, stride, element_count);    \
+#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, AccT)                   \
+  case Log2Elements:                                                    \
+    WarpSoftmaxForward<T, VecT, AccT, IndexType, Log2Elements, LogMode> \
+        <<<blocks, threads, 0, dev_ctx.stream()>>>(                     \
+            dst, src, batch_size, stride, element_count);               \
     break;
 
 /*
 Wrapper of softmax formward with template instantiation on size of input.
 */
-template <typename T, typename VecT, bool LogMode>
-void SwitchWarpSoftmaxForward(const int blocks,
+template <typename T, typename VecT, typename IndexType, bool LogMode>
+void SwitchWarpSoftmaxForward(const IndexType blocks,
                               const dim3 threads,
                               const GPUContext& dev_ctx,
                               T* dst,
                               const T* src,
-                              const int batch_size,
-                              const int stride,
-                              const int element_count,
-                              int Log2Elements) {
+                              const IndexType batch_size,
+                              const IndexType stride,
+                              const IndexType element_count,
+                              IndexType Log2Elements) {
   using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
   switch (Log2Elements) {
     SOFTMAX_WARP_FORWARD_CASE(0, AccT);
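SOFTMAX_WARP_FORWARD_CASE is the usual switch-to-template dispatch: a runtime Log2Elements value selects a kernel instantiation compiled for that exact power of two, and the patch simply threads IndexType through it. A simplified standalone sketch of the dispatch pattern (the names here are hypothetical, not Paddle's):

    #include <cstdio>

    // Compile-time specialized worker, standing in for WarpSoftmaxForward.
    template <int Log2Elements>
    void Worker() {
      std::printf("running instantiation for %d elements\n", 1 << Log2Elements);
    }

    // Runtime value -> compile-time template argument, one `case` per
    // supported Log2Elements, just like the macro above.
    #define WORKER_CASE(log2) \
      case log2:              \
        Worker<log2>();       \
        break;

    void Dispatch(int log2_elements) {
      switch (log2_elements) {
        WORKER_CASE(0);
        WORKER_CASE(1);
        WORKER_CASE(2);
        default:
          std::printf("unsupported size\n");
      }
    }

    int main() { Dispatch(2); }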
@@ -758,11 +761,12 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx,
   }
 }
 
-static std::vector<int> GetSoftmaxTensorDims(const phi::DDim& dims,
-                                             const int axis) {
-  int dim = dims[axis];
-  int N = phi::funcs::SizeToAxis(axis, dims);
-  int D = phi::funcs::SizeOutAxis(axis, dims);
+template <typename T = int>
+static std::vector<T> GetSoftmaxTensorDims(const phi::DDim& dims,
+                                           const int axis) {
+  auto dim = static_cast<T>(dims[axis]);
+  auto N = phi::funcs::SizeToAxis<T>(axis, dims);
+  auto D = phi::funcs::SizeOutAxis<T>(axis, dims);
   return {N, dim, D, 1};
 }
@@ -950,7 +954,9 @@ inline void LaunchSoftmaxBackwardCudnnKernel<phi::dtype::bfloat16>(
 #endif
 
 template <typename T>
-bool UseCudnnSoftmax(const GPUContext& ctx, int softmax_dim, bool last_dim) {
+bool UseCudnnSoftmax(const GPUContext& ctx,
+                     int64_t softmax_dim,
+                     bool last_dim) {
   bool cudnn_available = ctx.cudnn_handle();
   if (!ctx.cudnn_handle()) {
     if (std::is_same<T, phi::dtype::bfloat16>::value) {
@@ -968,24 +974,25 @@ bool UseCudnnSoftmax(const GPUContext& ctx, int softmax_dim, bool last_dim) {
   }
 }
 
-template <typename T, bool LogMode = false>
-void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
-                                    const DenseTensor& x,
-                                    const int input_axis,
-                                    DenseTensor* out) {
+template <typename T, typename IndexType, bool LogMode = false>
+void SoftmaxForwardCUDAKernelDriverImpl(const GPUContext& dev_ctx,
+                                        const DenseTensor& x,
+                                        const int input_axis,
+                                        DenseTensor* out) {
   auto* out_data = out->data<T>();
 
   int rank = x.dims().size();
   int axis = phi::funcs::CanonicalAxis(input_axis, rank);
-  std::vector<int> tensor_dims = GetSoftmaxTensorDims(x.dims(), axis);
-  int N = tensor_dims[0];
-  int dim = tensor_dims[1];
+  std::vector<IndexType> tensor_dims =
+      GetSoftmaxTensorDims<IndexType>(x.dims(), axis);
+  IndexType N = tensor_dims[0];
+  IndexType dim = tensor_dims[1];
   int D = tensor_dims[2];
 
   if (D == 1) {
     if (!UseCudnnSoftmax<T>(dev_ctx, dim, true)) {
       int dim_log2 = static_cast<int>(Log2Ceil(dim));
-      int dim_ceil = 1 << dim_log2;
+      IndexType dim_ceil = 1 << dim_log2;
       int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
       int batches_per_warp = (dim_ceil <= 32) ? 2 : 1;

@@ -994,7 +1001,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
       int warps_per_block = (threads_per_block / warp_size);
       int batches_per_block = warps_per_block * batches_per_warp;
-      int blocks = (N + batches_per_block - 1) / batches_per_block;
+      IndexType blocks = (N + batches_per_block - 1) / batches_per_block;
       dim3 threads(warp_size, warps_per_block, 1);
 
       // vectorization read/write
@@ -1002,35 +1009,35 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
       using T2 = typename VecT2<T>::Type;
 
       if (dim % 4 == 0) {
-        SwitchWarpSoftmaxForward<T, T4, LogMode>(blocks,
-                                                 threads,
-                                                 dev_ctx,
-                                                 out_data,
-                                                 x.data<T>(),
-                                                 N,
-                                                 dim,
-                                                 dim,
-                                                 dim_log2);
+        SwitchWarpSoftmaxForward<T, T4, IndexType, LogMode>(blocks,
+                                                            threads,
+                                                            dev_ctx,
+                                                            out_data,
+                                                            x.data<T>(),
+                                                            N,
+                                                            dim,
+                                                            dim,
+                                                            dim_log2);
       } else if (dim % 2 == 0) {
-        SwitchWarpSoftmaxForward<T, T2, LogMode>(blocks,
-                                                 threads,
-                                                 dev_ctx,
-                                                 out_data,
-                                                 x.data<T>(),
-                                                 N,
-                                                 dim,
-                                                 dim,
-                                                 dim_log2);
+        SwitchWarpSoftmaxForward<T, T2, IndexType, LogMode>(blocks,
+                                                            threads,
+                                                            dev_ctx,
+                                                            out_data,
+                                                            x.data<T>(),
+                                                            N,
+                                                            dim,
+                                                            dim,
+                                                            dim_log2);
       } else {
-        SwitchWarpSoftmaxForward<T, T, LogMode>(blocks,
-                                                threads,
-                                                dev_ctx,
-                                                out_data,
-                                                x.data<T>(),
-                                                N,
-                                                dim,
-                                                dim,
-                                                dim_log2);
+        SwitchWarpSoftmaxForward<T, T, IndexType, LogMode>(blocks,
+                                                           threads,
+                                                           dev_ctx,
+                                                           out_data,
+                                                           x.data<T>(),
+                                                           N,
+                                                           dim,
+                                                           dim,
+                                                           dim_log2);
       }
     } else {
       LaunchSoftmaxForwardCudnnKernel<T>(dev_ctx, x, axis, LogMode, out);
@@ -1041,6 +1048,20 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
   }
 }
 
+template <typename T, bool LogMode = false>
+void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
+                                    const DenseTensor& x,
+                                    const int input_axis,
+                                    DenseTensor* out) {
+  if (x.numel() >= std::numeric_limits<int32_t>::max()) {
+    SoftmaxForwardCUDAKernelDriverImpl<T, int64_t, LogMode>(
+        dev_ctx, x, input_axis, out);
+  } else {
+    SoftmaxForwardCUDAKernelDriverImpl<T, int32_t, LogMode>(
+        dev_ctx, x, input_axis, out);
+  }
+}
+
 template <typename T, bool LogMode = false>
 void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                      const DenseTensor& out,
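The hunk above is the heart of the fix: the public driver keeps its old signature and selects the index width at runtime from x.numel(), so only tensors at or above INT32_MAX elements pay for 64-bit indexing. A standalone sketch of the same dispatch (RunSoftmaxImpl is a stand-in, not Paddle's API):

    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Hypothetical stand-in for the IndexType-templated kernel driver.
    template <typename IndexType>
    void RunSoftmaxImpl(int64_t numel) {
      std::cout << "indexing with " << sizeof(IndexType) * 8
                << "-bit integers for " << numel << " elements\n";
    }

    // Mirrors the dispatch added by this commit: 64-bit indices only when
    // the element count no longer fits in int32_t.
    void RunSoftmax(int64_t numel) {
      if (numel >= std::numeric_limits<int32_t>::max()) {
        RunSoftmaxImpl<int64_t>(numel);
      } else {
        RunSoftmaxImpl<int32_t>(numel);
      }
    }

    int main() {
      RunSoftmax(1024);          // int32 path
      RunSoftmax(3000000000LL);  // int64 path
      return 0;
    }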