Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
f17a73e9
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f17a73e9
编写于
9月 30, 2022
作者:
Z
Zhang Zheng
提交者:
GitHub
9月 30, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimize performance of depthwise_conv_bwd (#46362)
* Optimize performance of depthwise_conv_bwd * fix
上级
2e231402
变更
1
显示空白变更内容
内联
并排
Showing
1 changed file
with
60 addition
and
47 deletion
+60
-47
paddle/phi/kernels/gpu/depthwise_conv.h
paddle/phi/kernels/gpu/depthwise_conv.h
+60
-47
未找到文件。
paddle/phi/kernels/gpu/depthwise_conv.h
浏览文件 @
f17a73e9
...
...
@@ -469,60 +469,62 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
const int dilate_height, const int dilate_width, \
T *const input_grad_data
template
<
typename
T
,
bool
fuse_relu_before_conv
>
template
<
typename
T
,
int
c_filter
,
bool
fuse_relu_before_conv
>
__device__
__inline__
void
KernelDepthwiseConvInputGradNCHW
(
ARG_DEFINE_KernelDepthwiseConvInputGrad
)
{
const
int
batch
=
blockIdx
.
y
;
const
int
c_in
=
blockIdx
.
x
;
for
(
int
w_in
=
threadIdx
.
x
;
w_in
<
input_width
;
w_in
+=
blockDim
.
x
)
{
for
(
int
h_in
=
threadIdx
.
y
;
h_in
<
input_height
;
h_in
+=
blockDim
.
y
)
{
const
int
c_out_start
=
c_in
*
filter_multiplier
;
int
h_out_start
=
h_in
-
(
filter_height
-
1
)
*
dilate_height
+
padding_height
;
int
h_out_end
=
h_in
+
padding_height
;
int
w_out_start
=
w_in
-
(
filter_width
-
1
)
*
dilate_width
+
padding_width
;
int
w_out_end
=
w_in
+
padding_width
;
T
value
(
0
);
int
index
=
((
batch
*
gridDim
.
x
+
c_in
)
*
input_height
+
h_in
)
*
input_width
+
w_in
;
const
int
fw_size
=
c_filter
!=
-
1
?
c_filter
:
filter_width
;
const
int
fh_size
=
c_filter
!=
-
1
?
c_filter
:
filter_height
;
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
idx
>=
batch_size
*
input_channels
*
input_height
*
input_width
)
{
return
;
}
if
(
fuse_relu_before_conv
)
{
if
(
input_data
[
index
]
<=
T
(
0
))
{
input_grad_data
[
inde
x
]
=
0
;
continue
;
if
(
input_data
[
idx
]
<=
static_cast
<
T
>
(
0.0
f
))
{
input_grad_data
[
id
x
]
=
0
;
return
;
}
}
for
(
int
c_out
=
c_out_start
;
c_out
<
c_out_start
+
filter_multiplier
;
c_out
++
)
{
int
filter_offset
=
(
c_out
+
1
)
*
filter_height
*
filter_width
;
for
(
int
h_out
=
h_out_start
;
h_out
<=
h_out_end
;
h_out
+=
dilate_height
)
{
for
(
int
w_out
=
w_out_start
;
w_out
<=
w_out_end
;
w_out
+=
dilate_width
)
{
filter_offset
--
;
int
s_h_out
=
h_out
/
stride_height
;
int
s_w_out
=
w_out
/
stride_width
;
if
(
h_out
%
stride_height
==
0
&&
w_out
%
stride_width
==
0
&&
s_h_out
>=
0
&&
s_h_out
<
output_height
&&
s_w_out
>=
0
&&
s_w_out
<
output_width
)
{
int
tmp_1
=
idx
/
input_width
;
const
int
w_in
=
idx
-
tmp_1
*
input_width
;
int
tmp_2
=
tmp_1
/
input_height
;
const
int
h_in
=
tmp_1
-
tmp_2
*
input_height
;
tmp_1
=
tmp_2
;
tmp_2
=
tmp_1
/
input_channels
;
const
int
c_in
=
tmp_1
-
tmp_2
*
input_channels
;
const
int
batch
=
tmp_2
;
T
value
(
0
);
for
(
int
c_mul
=
0
;
c_mul
<
filter_multiplier
;
++
c_mul
)
{
int
c_out
=
c_in
*
filter_multiplier
+
c_mul
;
int
filter_offset
=
c_out
*
filter_height
*
filter_width
;
#pragma unroll
for
(
int
fh
=
0
;
fh
<
fh_size
;
++
fh
)
{
#pragma unroll
for
(
int
fw
=
0
;
fw
<
fw_size
;
++
fw
)
{
int
h_out
=
h_in
+
padding_height
-
fh
*
dilate_height
;
int
w_out
=
w_in
+
padding_width
-
fw
*
dilate_width
;
if
((
h_out
-
h_out
/
stride_height
*
stride_height
==
0
)
&&
(
w_out
-
w_out
/
stride_width
*
stride_width
==
0
))
{
h_out
/=
stride_height
;
w_out
/=
stride_width
;
if
(
h_out
>=
0
&&
h_out
<
output_height
&&
w_out
>=
0
&&
w_out
<
output_width
)
{
int
output_grad_offset
=
((
batch
*
output_channels
+
c_out
)
*
output_height
+
s_h_out
)
*
((
batch
*
output_channels
+
c_out
)
*
output_height
+
h_out
)
*
output_width
+
s_
w_out
;
w_out
;
value
+=
output_grad_data
[
output_grad_offset
]
*
filter_data
[
filter_offset
];
}
}
filter_offset
++
;
}
}
input_grad_data
[
index
]
=
value
;
}
}
input_grad_data
[
idx
]
=
value
;
}
template
<
typename
T
,
bool
fuse_relu_before_conv
>
...
...
@@ -735,7 +737,7 @@ __global__ void KernelDepthwiseConvInputGradSp(
if
(
c_filter_multiplier
==
0
||
c_filter
==
-
1
)
{
if
(
data_layout
!=
DataLayout
::
kNHWC
)
{
KernelDepthwiseConvInputGradNCHW
<
T
,
fuse_relu_before_conv
>
(
KernelDepthwiseConvInputGradNCHW
<
T
,
c_filter
,
fuse_relu_before_conv
>
(
input_data
,
output_grad_data
,
filter_data
,
...
...
@@ -1247,8 +1249,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
batch_size
);
}
int
filter_multiplier
=
output_channels
/
input_channels
;
int
nums_output
=
batch_size
*
output_channels
*
output_height
*
output_width
;
int
nums_output
=
output
->
numel
();
#ifdef __HIPCC__
int
block_size
=
256
;
#else
...
...
@@ -1421,6 +1422,13 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
batch_size
);
}
int
filter_multiplier
=
output_channels
/
input_channels
;
int
nums_input
=
input_grad
->
numel
();
#ifdef __HIPCC__
int
block_size
=
256
;
#else
int
block_size
=
512
;
#endif
int
grid_size
=
(
nums_input
+
block_size
-
1
)
/
block_size
;
#define check_case(c_filter_multiplier, c_stride, c_filter) \
if (c_filter_multiplier == 0 || \
...
...
@@ -1429,6 +1437,11 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
(ksize_height == ksize_width && ksize_height == c_filter || \
c_filter == -1)) { \
if (data_layout != DataLayout::kNHWC) { \
if (c_filter == -1) { \
threads.x = block_size; \
grid.x = grid_size; \
threads.y = threads.z = grid.y = grid.z = 1; \
} \
KernelDepthwiseConvInputGradSp<T, \
c_filter_multiplier, \
c_stride, \
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录