Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
5dc0a6eb
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
5dc0a6eb
编写于
4月 14, 2021
作者:
A
AshburnLee
提交者:
GitHub
4月 14, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimize of backward of log_softmax when axis is -1 and dim_size <= 1024 (#32180)
上级
7da4455f
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
126 addition
and
6 deletion
+126
-6
paddle/fluid/operators/log_softmax_op.cu
paddle/fluid/operators/log_softmax_op.cu
+126
-6
未找到文件。
paddle/fluid/operators/log_softmax_op.cu
浏览文件 @
5dc0a6eb
...
...
@@ -65,11 +65,6 @@ __global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src,
constexpr
int
warp_iter
=
near_greater_power_of_two
/
kernel_warp_size
;
int
batch_id
=
blockDim
.
y
*
blockIdx
.
x
+
threadIdx
.
y
;
// set effective_warp_id as 1 when warps do effective work,
// when warps do ineffective work, effective_warp_id remains unchanged.
int
effective_warp_id
=
batch_size
-
batch_id
;
if
(
effective_warp_id
>
1
)
effective_warp_id
=
1
;
int
thread_in_warp_idx
=
threadIdx
.
x
;
// 1.read data from global memory to registers
...
...
@@ -77,7 +72,7 @@ __global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src,
// set effective_element_count as the num of elements when warps do effective
// work
// set effective_element_count as 0, when warps do ineffective work
int
effective_element_count
=
(
effective_warp_id
<=
0
)
?
0
:
element_count
;
int
effective_element_count
=
(
batch_id
<
batch_size
)
?
element_count
:
0
;
for
(
int
it
=
0
;
it
<
warp_iter
;
++
it
)
{
int
element_index
=
thread_in_warp_idx
+
it
*
kernel_warp_size
;
if
(
element_index
<
effective_element_count
)
{
...
...
@@ -181,6 +176,131 @@ class LogSoftmaxKernel<platform::CUDADeviceContext, T>
}
};
// Backward below
#define LAUNCH_WARP_BACKWARD_COMPUTE(near_greater_power_of_two) \
case near_greater_power_of_two: \
ComputeLogSoftmaxBackwardInWarp< \
T, AccT, near_greater_power_of_two><<<blocks, threads, 0, stream>>>( \
output, grad_output, grad_input, outer_size, dim_size); \
break;
template
<
typename
T
,
typename
AccT
,
int
NearGreaterPowerOfTwo
>
__global__
void
ComputeLogSoftmaxBackwardInWarp
(
const
T
*
output
,
const
T
*
grad_output
,
T
*
grad_input
,
int
batch_size
,
int
element_count
)
{
constexpr
int
near_greater_power_of_two
=
NearGreaterPowerOfTwo
;
constexpr
int
kernel_warp_size
=
(
near_greater_power_of_two
<
32
)
?
near_greater_power_of_two
:
32
;
constexpr
int
warp_iter
=
near_greater_power_of_two
/
kernel_warp_size
;
int
batch_id
=
blockDim
.
y
*
blockIdx
.
x
+
threadIdx
.
y
;
int
thread_in_warp_idx
=
threadIdx
.
x
%
kernel_warp_size
;
// 1.read data from global memory to registers
AccT
output_register
[
warp_iter
];
AccT
grad_output_register
[
warp_iter
];
int
effective_element_count
=
(
batch_id
<
batch_size
)
?
element_count
:
0
;
for
(
int
iter
=
0
;
iter
<
warp_iter
;
++
iter
)
{
int
element_index
=
thread_in_warp_idx
+
iter
*
kernel_warp_size
;
if
(
element_index
<
effective_element_count
)
{
output_register
[
iter
]
=
static_cast
<
AccT
>
(
output
[
batch_id
*
element_count
+
element_index
]);
grad_output_register
[
iter
]
=
static_cast
<
AccT
>
(
grad_output
[
batch_id
*
element_count
+
element_index
]);
}
else
{
output_register
[
iter
]
=
AccT
(
0
);
grad_output_register
[
iter
]
=
AccT
(
0
);
}
}
// 2. For each warp, accumulate all thread registers
AccT
sum
=
grad_output_register
[
0
];
#pragma unroll
for
(
int
iter
=
1
;
iter
<
warp_iter
;
++
iter
)
{
sum
+=
grad_output_register
[
iter
];
}
sum
=
WarpReduceSum
<
AccT
,
kernel_warp_size
>
(
sum
);
// 3. write result in grad_input
#pragma unroll
for
(
int
iter
=
0
;
iter
<
warp_iter
;
++
iter
)
{
int
element_index
=
thread_in_warp_idx
+
iter
*
kernel_warp_size
;
if
(
element_index
<
element_count
)
{
grad_input
[
batch_id
*
element_count
+
element_index
]
=
static_cast
<
T
>
(
(
grad_output_register
[
iter
]
-
std
::
exp
(
output_register
[
iter
])
*
sum
));
}
}
}
template
<
typename
T
,
typename
AccT
>
void
LaunchSoftmaxBackwardForLastAxis
(
T
*
grad_input
,
const
T
*
grad_output
,
const
T
*
output
,
int
dim_size
,
int
outer_size
,
gpuStream_t
stream
)
{
int
threads_per_block
=
128
;
int
near_greater_power_of_two
=
GetNearGreaterPowerOfTwo
(
dim_size
);
int
kernel_warp_size
=
(
near_greater_power_of_two
<
32
)
?
near_greater_power_of_two
:
32
;
int
warps_per_block
=
(
threads_per_block
/
kernel_warp_size
);
int
blocks
=
(
outer_size
+
warps_per_block
-
1
)
/
warps_per_block
;
dim3
threads
(
kernel_warp_size
,
warps_per_block
,
1
);
switch
(
near_greater_power_of_two
)
{
LAUNCH_WARP_BACKWARD_COMPUTE
(
1
);
// dim_size: 1
LAUNCH_WARP_BACKWARD_COMPUTE
(
2
);
// dim_size: 2
LAUNCH_WARP_BACKWARD_COMPUTE
(
4
);
// dim_size: 3~4
LAUNCH_WARP_BACKWARD_COMPUTE
(
8
);
// dim_size: 5~8
LAUNCH_WARP_BACKWARD_COMPUTE
(
16
);
// dim_size: 9~16
LAUNCH_WARP_BACKWARD_COMPUTE
(
32
);
// dim_size: 17~32
LAUNCH_WARP_BACKWARD_COMPUTE
(
64
);
// dim_size: 33~64
LAUNCH_WARP_BACKWARD_COMPUTE
(
128
);
// dim_size: 65~128
LAUNCH_WARP_BACKWARD_COMPUTE
(
256
);
// dim_size: 129~256
LAUNCH_WARP_BACKWARD_COMPUTE
(
512
);
// dim_size: 257~512
LAUNCH_WARP_BACKWARD_COMPUTE
(
1024
);
// dim_size: 513~1024
default:
break
;
}
}
template
<
typename
T
>
class
LogSoftmaxGradKernel
<
platform
::
CUDADeviceContext
,
T
>
:
public
framework
::
OpKernel
<
T
>
{
using
MPDType
=
typename
details
::
MPTypeTrait
<
T
>::
Type
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
const
auto
*
out
=
context
.
Input
<
framework
::
Tensor
>
(
"Out"
);
const
auto
*
g_out
=
context
.
Input
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
g_x
=
context
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"X"
));
const
auto
*
out_data
=
out
->
data
<
T
>
();
const
auto
*
g_out_data
=
g_out
->
data
<
T
>
();
auto
*
g_x_data
=
g_x
->
mutable_data
<
T
>
(
context
.
GetPlace
());
const
int
rank
=
out
->
dims
().
size
();
const
int
axis
=
CanonicalAxis
(
context
.
Attr
<
int
>
(
"axis"
),
rank
);
int
dim_size
=
out
->
dims
()[
axis
];
int
inner_size
=
1
;
for
(
int
i
=
axis
+
1
;
i
<
out
->
dims
().
size
();
++
i
)
{
inner_size
*=
out
->
dims
()[
i
];
}
int
outer_size
=
SizeToAxis
(
axis
,
out
->
dims
());
gpuStream_t
stream
=
context
.
cuda_device_context
().
stream
();
if
(
inner_size
==
1
&&
dim_size
<=
1024
&&
dim_size
*
sizeof
(
T
)
<=
4096
)
{
LaunchSoftmaxBackwardForLastAxis
<
T
,
MPDType
>
(
g_x_data
,
g_out_data
,
out_data
,
dim_size
,
outer_size
,
stream
);
}
else
{
LogSoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
T
>
()(
context
.
template
device_context
<
platform
::
CUDADeviceContext
>(),
out
,
g_out
,
g_x
,
axis
);
}
}
};
}
// operators
}
// paddle
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录