Commit 363b25aa (unverified)
Authored Apr 07, 2021 by Ouyang Chao; committed by GitHub on Apr 07, 2021.
improve performance of DepthwiseConv(NHWC) (#31677)
* improve performance of DepthwiseConv(NHWC)
Parent: 10af966a

Showing 3 changed files with 683 additions and 332 deletions (+683 −332).
paddle/fluid/operators/conv_op.h              +29  −75
paddle/fluid/operators/math/depthwise_conv.cu +654 −249
python/paddle/nn/functional/conv.py           +0   −8
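Editor's note: the substance of this commit is that the CUDA depthwise-convolution kernels now index NHWC tensors directly instead of transposing them to NCHW on the host and back. The sketch below is illustrative only — the function names are invented here and are not Paddle identifiers — and shows how the flat offset of element (n, c, h, w) differs between the two layouts, which is what the NCHW/NHWC kernel variants in the diffs specialize on.

// Illustrative only (not Paddle identifiers): flat offsets of element
// (n, c, h, w) under the two layouts handled by the kernel variants below.
__device__ __forceinline__ int OffsetNCHW(int n, int c, int h, int w,
                                          int C, int H, int W) {
  return ((n * C + c) * H + h) * W + w;   // channels vary slowest after batch
}

__device__ __forceinline__ int OffsetNHWC(int n, int c, int h, int w,
                                          int C, int H, int W) {
  return ((n * H + h) * W + w) * C + c;   // channels are contiguous in memory
}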
paddle/fluid/operators/conv_op.h
@@ -903,29 +903,19 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
               "and input channel number is %d",
               output->dims()[1], input->dims()[1]));
     }
-    // transform tensor
-    Tensor transformed_input(input->type());
-    Tensor transformed_output(output->type());
-
-    if (channel_last) {
-      ResizeToChannelFirst<DeviceContext, T>(context, input,
-                                             &transformed_input);
-      TransToChannelFirst<DeviceContext, T>(context, input,
-                                            &transformed_input);
-
-      ResizeToChannelFirst<DeviceContext, T>(context, output,
-                                             &transformed_output);
-    } else {
-      transformed_input = *input;
-      transformed_output = *output;
-    }
-
     // update padding and dilation
-    auto in_dims = transformed_input.dims();
+    auto in_dims = input->dims();
     auto filter_dims = filter.dims();
 
     framework::DDim in_data_dims;
-    in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
+    const framework::DataLayout data_layout =
+        framework::StringToDataLayout(data_format);
+    if (data_layout != framework::DataLayout::kNHWC) {
+      in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
+    } else {
+      in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
+    }
     framework::DDim filter_data_dims =
         framework::slice_ddim(filter_dims, 2, filter_dims.size());
@@ -944,16 +934,12 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
     if (fuse_relu) {
       math::DepthwiseConvFunctor<DeviceContext, T, true> depthwiseConv;
-      depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings,
-                    dilations, &transformed_output);
+      depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
+                    output, data_layout);
     } else {
       math::DepthwiseConvFunctor<DeviceContext, T, false> depthwiseConv;
-      depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings,
-                    dilations, &transformed_output);
-    }
-    if (channel_last) {
-      TransToChannelLast<DeviceContext, T>(context, &transformed_output,
-                                           output);
+      depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
+                    output, data_layout);
     }
   }
 };
@@ -981,33 +967,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
         context.Attr<std::string>("padding_algorithm");
     const std::string data_format = context.Attr<std::string>("data_format");
 
     const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
 
-    // transform Tensor
-    Tensor transformed_input(input->type());
-    Tensor transformed_output_grad(output_grad->type());
-
-    if (channel_last) {
-      ResizeToChannelFirst<DeviceContext, T>(context, input,
-                                             &transformed_input);
-      TransToChannelFirst<DeviceContext, T>(context, input,
-                                            &transformed_input);
-
-      ResizeToChannelFirst<DeviceContext, T>(context, output_grad,
-                                             &transformed_output_grad);
-      TransToChannelFirst<DeviceContext, T>(context, output_grad,
-                                            &transformed_output_grad);
-    } else {
-      transformed_input = *input;
-      transformed_output_grad = *output_grad;
-    }
-
     // update padding and dilation
-    auto in_dims = transformed_input.dims();
+    auto in_dims = input->dims();
     auto filter_dims = filter.dims();
 
     framework::DDim in_data_dims;
-    in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
+    const framework::DataLayout data_layout =
+        framework::StringToDataLayout(data_format);
+    if (data_layout != framework::DataLayout::kNHWC) {
+      in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
+    } else {
+      in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
+    }
     framework::DDim filter_data_dims =
         framework::slice_ddim(filter_dims, 2, filter_dims.size());
 
     std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
@@ -1025,33 +996,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
-      Tensor transformed_input_grad(input_grad->type());
-      if (channel_last) {
-        ResizeToChannelFirst<DeviceContext, T>(context, input_grad,
-                                               &transformed_input_grad);
-      } else {
-        transformed_input_grad = *input_grad;
-      }
-
-      set_zero(dev_ctx, &transformed_input_grad, static_cast<T>(0));
+      set_zero(dev_ctx, input_grad, static_cast<T>(0));
 
       if (fuse_relu) {
         math::DepthwiseConvInputGradFunctor<DeviceContext, T, true>
             depthwiseConvInputGrad;
-        depthwiseConvInputGrad(dev_ctx, transformed_input, filter,
-                               transformed_output_grad, strides, paddings,
-                               dilations, &transformed_input_grad);
+        depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
+                               paddings, dilations, input_grad, data_layout);
       } else {
         math::DepthwiseConvInputGradFunctor<DeviceContext, T, false>
             depthwiseConvInputGrad;
-        depthwiseConvInputGrad(dev_ctx, transformed_input, filter,
-                               transformed_output_grad, strides, paddings,
-                               dilations, &transformed_input_grad);
-      }
-      if (channel_last) {
-        TransToChannelLast<DeviceContext, T>(context, &transformed_input_grad,
-                                             input_grad);
+        depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
+                               paddings, dilations, input_grad, data_layout);
       }
     }
@@ -1061,15 +1017,13 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
       if (fuse_relu) {
         math::DepthwiseConvFilterGradFunctor<DeviceContext, T, true>
             depthwiseConvFilterGrad;
-        depthwiseConvFilterGrad(dev_ctx, transformed_input,
-                                transformed_output_grad, strides, paddings,
-                                dilations, filter_grad);
+        depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
+                                paddings, dilations, filter_grad, data_layout);
       } else {
         math::DepthwiseConvFilterGradFunctor<DeviceContext, T, false>
             depthwiseConvFilterGrad;
-        depthwiseConvFilterGrad(dev_ctx, transformed_input,
-                                transformed_output_grad, strides, paddings,
-                                dilations, filter_grad);
+        depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
+                                paddings, dilations, filter_grad, data_layout);
       }
     }
   }
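Editor's note on the conv_op.h changes above: the NHWC path no longer materializes channel-first copies of the input, output, and gradients around every functor call; the kernel forwards the runtime data_layout to the math functors and lets them pick the matching CUDA kernel. A minimal standalone sketch of that control-flow change, using hypothetical names (this is not Paddle's API):

#include <cstdio>

enum class DataLayout { kNCHW, kNHWC };

// Hypothetical stand-in for the layout-aware functor call introduced above:
// the caller no longer transposes; it reports the layout the data already has.
void RunDepthwiseConv(DataLayout layout) {
  if (layout != DataLayout::kNHWC) {
    std::printf("launch NCHW kernel\n");
  } else {
    std::printf("launch NHWC kernel (no host-side transpose round-trip)\n");
  }
}

int main() {
  RunDepthwiseConv(DataLayout::kNHWC);
  return 0;
}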
paddle/fluid/operators/math/depthwise_conv.cu
@@ -22,6 +22,7 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/math/depthwise_conv.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -52,8 +53,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
       const int filter_multiplier, const int filter_height,                  \
       const int filter_width, const int stride_height, const int stride_width, \
       const int padding_height, const int padding_width,                     \
-      const int dilate_height, const int dilate_width, T *const output_data, \
-      const DataLayout data_layout = DataLayout::kNCHW
+      const int dilate_height, const int dilate_width, T *const output_data
 
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NCHW format.
@@ -123,7 +123,6 @@ __device__ __inline__ void KernelDepthwiseConvNHWC(
     const int batch = idx / output_width / output_height / output_channels;
 
     const int c_in = c_out / filter_multiplier;
-    const T* weight = filter_data + c_out * filter_height * filter_width;
     T value = 0;
     const int h_in_start = -padding_height + h_out * stride_height;
     const int w_in_start = -padding_width + w_out * stride_width;
@@ -142,13 +141,14 @@ __device__ __inline__ void KernelDepthwiseConvNHWC(
       for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) {
         if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
             w_in < w_end) {
-          int offset = ((batch * input_height + h_in) * input_width + w_in) *
-                           output_channels +
-                       c_in;
+          int offset = ((batch * input_height + h_in) * input_width + w_in) *
+                           input_channels +
+                       c_in;
           T in_data = input_data[offset];
+          const T* weight = filter_data + weight_offset * output_channels + c_out;
           if (fuse_relu_before_conv) {
-            value += weight[weight_offset] * max(0.0f, in_data);
+            value += weight[0] * max(0.0f, in_data);
           } else {
-            value += weight[weight_offset] * in_data;
+            value += weight[0] * in_data;
           }
         }
         weight_offset++;
@@ -161,10 +161,10 @@ __device__ __inline__ void KernelDepthwiseConvNHWC(
 }
 
 template <typename T, int c_filter, bool fuse_relu_before_conv>
-__device__ __inline__ void KernelDepthwiseConvCFilter(
+__device__ __inline__ void KernelDepthwiseConvCFilterNCHW(
     ARG_DEFINE_KernelDepthwiseConv) {
-  const int kWeghtSize = c_filter * c_filter;
-  T r_weight[kWeghtSize];
+  const int kWeightSize = c_filter * c_filter;
+  T r_weight[kWeightSize];
   const int batch = blockIdx.y;
   const int c_out = blockIdx.x;
   const T* weight = filter_data + c_out * c_filter * c_filter;
@@ -182,13 +182,8 @@ __device__ __inline__ void KernelDepthwiseConvCFilter(
     const int h_in_end = h_in_start + c_filter * dilate_height;
     const int w_in_end = w_in_start + c_filter * dilate_width;
 
-    int in_offset;
-    if (data_layout != DataLayout::kNHWC) {
-      in_offset =
-          ((batch * input_channels + c_in) * input_height) * input_width;
-    } else {
-      in_offset = batch * input_height * input_width * input_channels;
-    }
+    int in_offset =
+        ((batch * input_channels + c_in) * input_height) * input_width;
 
     const int h_end = h_in_end < input_height ? h_in_end : input_height;
     const int w_end = w_in_end < input_width ? w_in_end : input_width;
@@ -201,13 +196,63 @@ __device__ __inline__ void KernelDepthwiseConvCFilter(
          w_in += dilate_width, w_f++) {
       if (h_in >= 0 && h_in < input_height && w_in >= 0 &&
           w_in < input_width) {
-        int offset;
-        if (data_layout != DataLayout::kNHWC) {
-          offset = in_offset + h_in * input_width + w_in;
+        int offset = in_offset + h_in * input_width + w_in;
         if (fuse_relu_before_conv) {
           value += r_weight[h_f * c_filter + w_f] *
                    max(0.0f, input_data[offset]);
         } else {
-          offset = in_offset + (h_in * input_width + w_in) * input_channels +
-                   c_in;
           value += r_weight[h_f * c_filter + w_f] * input_data[offset];
         }
       }
     }
   }
   int index = ((batch * gridDim.x + c_out) * output_height + h_out) *
                   output_width +
               w_out;
   output_data[index] = value;
   }
  }
 }
 
+template <typename T, int c_filter, bool fuse_relu_before_conv>
+__device__ __inline__ void KernelDepthwiseConvCFilterNHWC(
+    ARG_DEFINE_KernelDepthwiseConv) {
+  const int batch = blockIdx.z;
+  int h_out = blockIdx.x * dilate_height + blockIdx.y;
+  if (h_out >= output_height) {
+    return;
+  }
+  int in_offset = batch * input_height * input_width * input_channels;
+  int out_offset =
+      (batch * output_height + h_out) * output_width * output_channels;
+  const int h_in_start = -padding_height + h_out * stride_height;
+  const int wi_size = (output_width + dilate_width - 1) / dilate_width;
+  const int kWeightSize = c_filter * c_filter;
+  T r_weight[kWeightSize];
+
+  for (int c_out = threadIdx.x; c_out < output_channels; c_out += blockDim.x) {
+    for (int i = 0; i < c_filter * c_filter; i++) {
+      const T* weight = filter_data + i * output_channels + c_out;
+      r_weight[i] = weight[0];
+    }
+    const int c_in = c_out / filter_multiplier;
+    for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) {
+      int i_dw = i / wi_size;
+      int i_wi = i - i_dw * wi_size;
+      int w_out = i_wi * dilate_width + i_dw;
+      if (w_out >= output_width) {
+        continue;
+      }
+      T value = 0;
+      const int w_in_start = -padding_width + w_out * stride_width;
+      for (int h_in = h_in_start, h_f = 0; h_f < c_filter;
+           h_in += dilate_height, h_f++) {
+        for (int w_in = w_in_start, w_f = 0; w_f < c_filter;
+             w_in += dilate_width, w_f++) {
+          if (h_in >= 0 && h_in < input_height && w_in >= 0 &&
+              w_in < input_width) {
+            int offset = in_offset +
+                         (h_in * input_width + w_in) * input_channels + c_in;
+            if (fuse_relu_before_conv) {
+              value += r_weight[h_f * c_filter + w_f] *
+                       max(0.0f, input_data[offset]);
@@ -217,23 +262,14 @@ __device__ __inline__ void KernelDepthwiseConvCFilter(
             }
           }
         }
-      int index;
-      if (data_layout != DataLayout::kNHWC) {
-        index = ((batch * gridDim.x + c_out) * output_height + h_out) *
-                    output_width +
-                w_out;
-      } else {
-        index = ((batch * output_height + h_out) * output_width + w_out) *
-                    gridDim.x +
-                c_out;
-      }
+      int index = out_offset + w_out * output_channels + c_out;
       output_data[index] = value;
     }
   }
 }
 
 template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
-          bool fuse_relu_before_conv>
+          DataLayout data_layout, bool fuse_relu_before_conv>
 __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
   int final_filter_multiplier = filter_multiplier;
   int h_stride = stride_height;
@@ -244,28 +280,37 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
     w_stride = c_stride;
   }
   if (c_filter == -1) {
-    if (data_layout == DataLayout::kNCHW) {
+    if (data_layout != DataLayout::kNHWC) {
       KernelDepthwiseConvNCHW<T, fuse_relu_before_conv>(
           input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          final_filter_multiplier, filter_height, filter_width, h_stride,
-          w_stride, padding_height, padding_width, dilate_height, dilate_width,
-          output_data, data_layout);
+          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          output_data);
     } else {
       KernelDepthwiseConvNHWC<T, fuse_relu_before_conv>(
           input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          final_filter_multiplier, filter_height, filter_width, h_stride,
-          w_stride, padding_height, padding_width, dilate_height, dilate_width,
-          output_data, data_layout);
+          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          output_data);
     }
   } else {
-    KernelDepthwiseConvCFilter<T, c_filter, fuse_relu_before_conv>(
-        input_data, filter_data, batch_size, output_channels, output_height,
-        output_width, input_channels, input_height, input_width,
-        final_filter_multiplier, filter_height, filter_width, h_stride,
-        w_stride, padding_height, padding_width, dilate_height, dilate_width,
-        output_data, data_layout);
+    if (data_layout != DataLayout::kNHWC) {
+      KernelDepthwiseConvCFilterNCHW<T, c_filter, fuse_relu_before_conv>(
+          input_data, filter_data, batch_size, output_channels, output_height,
+          output_width, input_channels, input_height, input_width,
+          final_filter_multiplier, filter_height, filter_width, h_stride,
+          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          output_data);
+    } else {
+      KernelDepthwiseConvCFilterNHWC<T, c_filter, fuse_relu_before_conv>(
+          input_data, filter_data, batch_size, output_channels, output_height,
+          output_width, input_channels, input_height, input_width,
+          final_filter_multiplier, filter_height, filter_width, h_stride,
+          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          output_data);
+    }
   }
 }
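Editor's note: in the hunk above, KernelDepthwiseConvSp gains DataLayout as a non-type template parameter, so the layout test inside the __global__ wrapper is a compile-time constant and each instantiation effectively carries only the branch it needs. A minimal sketch of that dispatch pattern with a made-up kernel (not Paddle code):

enum class Layout { kNCHW, kNHWC };

// Made-up kernel showing layout selection via a non-type template parameter;
// for each instantiation the condition below is a compile-time constant.
template <typename T, Layout layout>
__global__ void ScaleByLayout(const T* in, T* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  if (layout != Layout::kNHWC) {
    out[i] = in[i] * static_cast<T>(2);  // "NCHW" specialization path
  } else {
    out[i] = in[i] * static_cast<T>(3);  // "NHWC" specialization path
  }
}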
@@ -280,40 +325,27 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
       const int filter_width, const int stride_height, const int stride_width, \
       const int padding_height, const int padding_width,                     \
       const int dilate_height, const int dilate_width,                       \
-      T *const input_grad_data,                                              \
-      const DataLayout data_layout = DataLayout::kNCHW
+      T *const input_grad_data
 
 template <typename T, bool fuse_relu_before_conv>
-__device__ __inline__ void KernelDepthwiseConvInputGrad(
+__device__ __inline__ void KernelDepthwiseConvInputGradNCHW(
     ARG_DEFINE_KernelDepthwiseConvInputGrad) {
+  const int batch = blockIdx.y;
+  const int c_in = blockIdx.x;
   for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
     for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
-      const int batch = blockIdx.y;
-      const int c_in = blockIdx.x;
       const int c_out_start = c_in * filter_multiplier;
       int h_out_start =
           h_in - (filter_height - 1) * dilate_height + padding_height;
       int h_out_end = h_in + padding_height;
       int w_out_start =
           w_in - (filter_width - 1) * dilate_width + padding_width;
       int w_out_end = w_in + padding_width;
 
       T value = 0;
-      int index;
-      if (data_layout != DataLayout::kNHWC) {
-        index =
-            ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
-            w_in;
-      } else {
-        index =
-            ((batch * input_height + h_in) * input_width + w_in) * gridDim.x +
-            c_in;
-      }
+      int index =
+          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
+          w_in;
 
       if (fuse_relu_before_conv) {
         if (input_data[index] <= 0) {
@@ -335,20 +367,67 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad(
           if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
               s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
               s_w_out < output_width) {
-            int output_grad_offset;
-            if (data_layout != DataLayout::kNHWC) {
-              output_grad_offset =
-                  ((batch * output_channels + c_out) * output_height + s_h_out) *
-                      output_width +
-                  s_w_out;
-            } else {
-              output_grad_offset =
-                  ((batch * output_height + s_h_out) * output_width + s_w_out) *
-                      output_channels +
-                  c_out;
-            }
+            int output_grad_offset =
+                ((batch * output_channels + c_out) * output_height + s_h_out) *
+                    output_width +
+                s_w_out;
             value += output_grad_data[output_grad_offset] *
                      filter_data[filter_offset];
           }
         }
       }
     }
     input_grad_data[index] = value;
   }
 }
 
+template <typename T, bool fuse_relu_before_conv>
+__device__ __inline__ void KernelDepthwiseConvInputGradNHWC(
+    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
+  const int batch = blockIdx.z;
+  int h_in = blockIdx.x * dilate_height + blockIdx.y;
+  if (h_in >= input_height) {
+    return;
+  }
+
+  for (int c_in = threadIdx.x; c_in < input_channels; c_in += blockDim.x) {
+    for (int w_in = threadIdx.y; w_in < input_width; w_in += blockDim.y) {
+      int h_out_start =
+          h_in - (filter_height - 1) * dilate_height + padding_height;
+      int w_out_start =
+          w_in - (filter_width - 1) * dilate_width + padding_width;
+
+      T value = 0;
+      int index = ((batch * input_height + h_in) * input_width + w_in) *
+                      input_channels +
+                  c_in;
+      if (fuse_relu_before_conv) {
+        if (input_data[index] <= 0) {
+          input_grad_data[index] = 0;
+          continue;
+        }
+      }
+
+      for (int c_i = 0; c_i < filter_multiplier; c_i++) {
+        int c_out = c_in * filter_multiplier + c_i;
+        int weight_offset = filter_height * filter_width;
+        for (int h_out = h_out_start, h_f = 0; h_f < filter_height;
+             h_out += dilate_height, h_f++) {
+          for (int w_out = w_out_start, w_f = 0; w_f < filter_width;
+               w_out += dilate_width, w_f++) {
+            weight_offset--;
+            int s_h_out = h_out / stride_height;
+            int s_w_out = w_out / stride_width;
+            if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
+                s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
+                s_w_out < output_width) {
+              int output_grad_offset =
+                  ((batch * output_height + s_h_out) * output_width + s_w_out) *
+                      output_channels +
+                  c_out;
+              int filter_offset = weight_offset * output_channels + c_out;
+              value += output_grad_data[output_grad_offset] *
+                       filter_data[filter_offset];
+            }
@@ -362,10 +441,10 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad(
 
 template <typename T, int c_filter, int c_filter_multiplier,
           bool fuse_relu_before_conv>
-__device__ __inline__ void KernelDepthwiseConvInputGradCFilter(
+__device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW(
     ARG_DEFINE_KernelDepthwiseConvInputGrad) {
-  const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1;
-  T r_weight[kWeghtSize];
+  const int kWeightSize = c_filter * c_filter * c_filter_multiplier + 1;
+  T r_weight[kWeightSize];
   const int batch = blockIdx.y;
   const int c_in = blockIdx.x;
@@ -379,24 +458,13 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter(
   for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
     for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
-      const int batch = blockIdx.y;
-      const int c_in = blockIdx.x;
       int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height;
 
       int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width;
 
       T value = 0;
-      int index;
-      if (data_layout != DataLayout::kNHWC) {
-        index =
-            ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
-            w_in;
-      } else {
-        index =
-            ((batch * input_height + h_in) * input_width + w_in) * gridDim.x +
-            c_in;
-      }
+      int index =
+          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
+          w_in;
       if (fuse_relu_before_conv) {
         if (input_data[index] <= 0) {
           input_grad_data[index] = 0;
@@ -415,20 +483,11 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter(
           if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
               s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
               s_w_out < output_width) {
-            int output_grad_offset;
-            if (data_layout != DataLayout::kNHWC) {
-              output_grad_offset =
-                  ((batch * output_channels + c_out) * output_height + s_h_out) *
-                      output_width +
-                  s_w_out;
-            } else {
-              output_grad_offset =
-                  ((batch * output_height + s_h_out) * output_width + s_w_out) *
-                      output_channels +
-                  c_out;
-            }
+            int output_grad_offset =
+                ((batch * output_channels + c_out) * output_height + s_h_out) *
+                    output_width +
+                s_w_out;
             value += output_grad_data[output_grad_offset] *
                      r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter];
@@ -441,47 +500,137 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter(
   }
 }
 
+template <typename T, int c_filter, int c_filter_multiplier,
+          bool fuse_relu_before_conv>
+__device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC(
+    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
+  int h_in = blockIdx.x * dilate_height + blockIdx.y;
+  if (h_in >= input_height) {
+    return;
+  }
+  const int kWeightSize = c_filter * c_filter * c_filter_multiplier + 1;
+  T r_weight[kWeightSize];
+  const int batch = blockIdx.z;
+  const int wi_size = (input_width + dilate_width - 1) / dilate_width;
+  const int h_out_start =
+      h_in - (c_filter - 1) * dilate_height + padding_height;
+
+  for (int c_in = threadIdx.x; c_in < input_channels; c_in += blockDim.x) {
+    for (int c_i = 0; c_i < c_filter_multiplier; c_i++) {
+      int c_out = c_in * c_filter_multiplier + c_i;
+      for (int i = 0; i < c_filter * c_filter; i++)
+        r_weight[i + c_i * c_filter * c_filter] =
+            filter_data[(c_filter * c_filter - i - 1) * output_channels +
+                        c_out];
+    }
+    for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) {
+      int i_dw = i / wi_size;
+      int i_wi = i - i_dw * wi_size;
+      int w_in = i_wi * dilate_width + i_dw;
+      if (w_in >= input_width) {
+        continue;
+      }
+      int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width;
+
+      T value = 0;
+      int index = ((batch * input_height + h_in) * input_width + w_in) *
+                      input_channels +
+                  c_in;
+      if (fuse_relu_before_conv) {
+        if (input_data[index] <= 0) {
+          input_grad_data[index] = 0;
+          continue;
+        }
+      }
+
+      for (int c_i = 0; c_i < c_filter_multiplier; c_i++) {
+        int c_out = c_in * c_filter_multiplier + c_i;
+        for (int h_out = h_out_start, h_f = 0; h_f < c_filter;
+             h_out += dilate_height, h_f++) {
+          for (int w_out = w_out_start, w_f = 0; w_f < c_filter;
+               w_out += dilate_width, w_f++) {
+            int s_h_out = h_out / stride_height;
+            int s_w_out = w_out / stride_width;
+            if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
+                s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
+                s_w_out < output_width) {
+              int output_grad_offset =
+                  ((batch * output_height + s_h_out) * output_width + s_w_out) *
+                      output_channels +
+                  c_out;
+              value +=
+                  output_grad_data[output_grad_offset] *
+                  r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter];
+            }
+          }
+        }
+      }
+      input_grad_data[index] = value;
+    }
+  }
+}
+
 template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
-          bool fuse_relu_before_conv>
+          DataLayout data_layout, bool fuse_relu_before_conv>
 __global__ void KernelDepthwiseConvInputGradSp(
     ARG_DEFINE_KernelDepthwiseConvInputGrad) {
-  if (c_filter_multiplier == 0)
-    KernelDepthwiseConvInputGrad<T, fuse_relu_before_conv>(
-        input_data, output_grad_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        filter_multiplier, filter_height, filter_width, stride_height,
-        stride_width, padding_height, padding_width, dilate_height,
-        dilate_width, input_grad_data, data_layout);
-  else if (c_filter == -1)
-    KernelDepthwiseConvInputGrad<T, fuse_relu_before_conv>(
-        input_data, output_grad_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
-        padding_height, padding_width, dilate_height, dilate_width,
-        input_grad_data, data_layout);
-  else
-    KernelDepthwiseConvInputGradCFilter<T, c_filter, c_filter_multiplier,
-                                        fuse_relu_before_conv>(
-        input_data, output_grad_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
-        padding_height, padding_width, dilate_height, dilate_width,
-        input_grad_data, data_layout);
-}
+  int final_filter_multiplier = filter_multiplier;
+  int h_stride = stride_height;
+  int w_stride = stride_width;
+  if (c_filter_multiplier != 0) {
+    final_filter_multiplier = c_filter_multiplier;
+    h_stride = c_stride;
+    w_stride = c_stride;
+  }
+  if (c_filter_multiplier == 0 || c_filter == -1) {
+    if (data_layout != DataLayout::kNHWC) {
+      KernelDepthwiseConvInputGradNCHW<T, fuse_relu_before_conv>(
+          input_data, output_grad_data, filter_data, batch_size,
+          output_channels, output_height, output_width, input_channels,
+          input_height, input_width, final_filter_multiplier, filter_height,
+          filter_width, h_stride, w_stride, padding_height, padding_width,
+          dilate_height, dilate_width, input_grad_data);
+    } else {
+      KernelDepthwiseConvInputGradNHWC<T, fuse_relu_before_conv>(
+          input_data, output_grad_data, filter_data, batch_size,
+          output_channels, output_height, output_width, input_channels,
+          input_height, input_width, final_filter_multiplier, filter_height,
+          filter_width, h_stride, w_stride, padding_height, padding_width,
+          dilate_height, dilate_width, input_grad_data);
+    }
+  } else {
+    if (data_layout != DataLayout::kNHWC) {
+      KernelDepthwiseConvInputGradCFilterNCHW<T, c_filter, c_filter_multiplier,
+                                              fuse_relu_before_conv>(
+          input_data, output_grad_data, filter_data, batch_size,
+          output_channels, output_height, output_width, input_channels,
+          input_height, input_width, c_filter_multiplier, filter_height,
+          filter_width, c_stride, c_stride, padding_height, padding_width,
+          dilate_height, dilate_width, input_grad_data);
+    } else {
+      KernelDepthwiseConvInputGradCFilterNHWC<T, c_filter, c_filter_multiplier,
+                                              fuse_relu_before_conv>(
+          input_data, output_grad_data, filter_data, batch_size,
+          output_channels, output_height, output_width, input_channels,
+          input_height, input_width, c_filter_multiplier, filter_height,
+          filter_width, c_stride, c_stride, padding_height, padding_width,
+          dilate_height, dilate_width, input_grad_data);
+    }
+  }
+}
 
 // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T, bool fuse_relu_before_conv>
-__device__ __inline__ void KernelDepthwiseConvFilterGrad(
+__device__ __inline__ void KernelDepthwiseConvFilterGradNCHW(
     const T* output_grad_data, const T* input_data, const int num,
     const int output_channels, const int output_height, const int output_width,
     const int input_channels, const int input_height, const int input_width,
     const int filter_multiplier, const int filter_height,
     const int filter_width, const int stride_height, const int stride_width,
     const int padding_height, const int padding_width,
-    const int dilate_height, const int dilate_width, T* filter_grad_data,
-    const DataLayout data_layout = DataLayout::kNCHW) {
+    const int dilate_height, const int dilate_width, T* filter_grad_data) {
   T s = 0;
   int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
 
   for (int image_w = threadIdx.x; image_w < output_width;
@@ -499,45 +648,137 @@ __device__ __inline__ void KernelDepthwiseConvFilterGrad(
       if (image_wk < 0 || image_wk >= input_width) continue;
 #define gaid(N, C, H, W) \
   ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
-#define gaid_nhwc(N, H, W, C) \
-  ((((N)*output_height + (H)) * output_width + (W)) * gridDim.z + (C))
-      int input_id;
-      if (data_layout != DataLayout::kNHWC) {
-        input_id = ((bid * (gridDim.z / filter_multiplier) +
-                     kernel_id / filter_multiplier) *
-                        input_height +
-                    image_hk) *
-                       input_width +
-                   image_wk;
-        if (fuse_relu_before_conv) {
-          s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
-               max(0.0f, input_data[input_id]);
-        } else {
-          s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
-               input_data[input_id];
-        }
-      } else {
-        input_id = ((bid * input_height + image_hk) * input_width + image_wk) *
-                       (gridDim.z / filter_multiplier) +
-                   kernel_id / filter_multiplier;
-        if (fuse_relu_before_conv) {
-          s += output_grad_data[gaid_nhwc(bid, image_h, image_w, kernel_id)] *
-               max(0.0f, input_data[input_id]);
-        } else {
-          s += output_grad_data[gaid_nhwc(bid, image_h, image_w, kernel_id)] *
-               input_data[input_id];
-        }
-      }
+      int input_id = ((bid * (gridDim.z / filter_multiplier) +
+                       kernel_id / filter_multiplier) *
+                          input_height +
+                      image_hk) *
+                         input_width +
+                     image_wk;
+      if (fuse_relu_before_conv) {
+        s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
+             max(0.0f, input_data[input_id]);
+      } else {
+        s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
+             input_data[input_id];
+      }
 #undef gaid
     }
   }
   CudaAtomicAddWithWarp(&filter_grad_data[gbid], s);
 }
 
+template <typename T, bool fuse_relu_before_conv>
+__device__ __inline__ void KernelDepthwiseConvFilterGradNHWC(
+    const T* output_grad_data, const T* input_data, const int num,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width,
+    const int dilate_height, const int dilate_width, T* filter_grad_data) {
+  int bid = blockIdx.z;
+  int image_h = blockIdx.y;
+  int kernel_iw = blockIdx.x % filter_width;
+  int kernel_ih = blockIdx.x / filter_width;
+  for (int kernel_id = threadIdx.x; kernel_id < output_channels;
+       kernel_id += blockDim.x) {
+    T s = 0;
+    int gbid =
+        ((kernel_id * filter_height) + kernel_ih) * filter_width + kernel_iw;
+    for (int image_w = threadIdx.y; image_w < output_width;
+         image_w += blockDim.y) {
+      int kernel_h = kernel_ih * dilate_height - padding_height;
+      int kernel_w = kernel_iw * dilate_width - padding_width;
+      int image_hk = image_h * stride_height + kernel_h;
+      int image_wk = image_w * stride_width + kernel_w;
+      if (image_hk < 0 || image_hk >= input_height) continue;
+      if (image_wk < 0 || image_wk >= input_width) continue;
+#define gaid(N, H, W, C) \
+  ((((N)*output_height + (H)) * output_width + (W)) * output_channels + (C))
+      int input_id = ((bid * input_height + image_hk) * input_width +
+                      image_wk) *
+                         input_channels +
+                     kernel_id / filter_multiplier;
+      if (fuse_relu_before_conv) {
+        s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] *
+             max(0.0f, input_data[input_id]);
+      } else {
+        s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] *
+             input_data[input_id];
+      }
+#undef gaid
+    }
+    platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
+  }
+}
+
+template <typename T, int c_filter, bool fuse_relu_before_conv>
+__device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC(
+    const T* output_grad_data, const T* input_data, const int num,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width,
+    const int dilate_height, const int dilate_width, T* filter_grad_data) {
+  const int bid = blockIdx.z;
+  int image_h = blockIdx.x * dilate_height + blockIdx.y;
+  if (image_h >= output_height) {
+    return;
+  }
+  const int kWeightSize = c_filter * c_filter;
+  T r_weight[kWeightSize];
+  const int wi_size = (output_width + dilate_width - 1) / dilate_width;
+
+  for (int kernel_id = threadIdx.x; kernel_id < output_channels;
+       kernel_id += blockDim.x) {
+    for (int i = 0; i < c_filter * c_filter; ++i) {
+      r_weight[i] = 0;
+    }
+    for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) {
+      int i_dw = i / wi_size;
+      int i_wi = i - i_dw * wi_size;
+      int image_w = i_wi * dilate_width + i_dw;
+      if (image_w >= output_width) {
+        continue;
+      }
+      for (int kernel_ih = 0; kernel_ih < c_filter; ++kernel_ih) {
+        for (int kernel_iw = 0; kernel_iw < c_filter; ++kernel_iw) {
+          int kernel_h = kernel_ih * dilate_height - padding_height;
+          int kernel_w = kernel_iw * dilate_width - padding_width;
+          int image_hk = image_h * stride_height + kernel_h;
+          int image_wk = image_w * stride_width + kernel_w;
+          if (image_hk < 0 || image_hk >= input_height) continue;
+          if (image_wk < 0 || image_wk >= input_width) continue;
+          int input_id = ((bid * input_height + image_hk) * input_width +
+                          image_wk) *
+                             input_channels +
+                         kernel_id / filter_multiplier;
+          int output_id = ((bid * output_height + image_h) * output_width +
+                           image_w) *
+                              output_channels +
+                          kernel_id;
+          T s = 0;
+          if (fuse_relu_before_conv) {
+            s = output_grad_data[output_id] * max(0.0f, input_data[input_id]);
+          } else {
+            s = output_grad_data[output_id] * input_data[input_id];
+          }
+          r_weight[kernel_ih * c_filter + kernel_iw] += s;
+        }
+      }
+    }
+    for (int i = 0; i < c_filter * c_filter; ++i) {
+      T* weight = filter_grad_data + i * output_channels + kernel_id;
+      platform::CudaAtomicAdd(&weight[0], r_weight[i]);
+    }
+  }
+}
+
-template <typename T, int c_filter_multiplier, bool fuse_relu_before_conv>
+template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
+          DataLayout data_layout, bool fuse_relu_before_conv>
 __global__ void KernelDepthwiseConvFilterGradSp(
     const T* output_grad_data, const T* input_data, const int num,
     const int output_channels, const int output_height, const int output_width,
@@ -545,22 +786,49 @@ __global__ void KernelDepthwiseConvFilterGradSp(
     const int filter_multiplier, const int filter_height,
     const int filter_width, const int stride_height, const int stride_width,
     const int padding_height, const int padding_width,
-    const int dilate_height, const int dilate_width, T* filter_grad_data,
-    const DataLayout data_layout = DataLayout::kNCHW) {
-  if (c_filter_multiplier == 0)
-    KernelDepthwiseConvFilterGrad<T, fuse_relu_before_conv>(
-        output_grad_data, input_data, num, output_channels, output_height,
-        output_width, input_channels, input_height, input_width,
-        filter_multiplier, filter_height, filter_width, stride_height,
-        stride_width, padding_height, padding_width, dilate_height,
-        dilate_width, filter_grad_data, data_layout);
-  else
-    KernelDepthwiseConvFilterGrad<T, fuse_relu_before_conv>(
-        output_grad_data, input_data, num, output_channels, output_height,
-        output_width, input_channels, input_height, input_width,
-        c_filter_multiplier, filter_height, filter_width, stride_height,
-        stride_width, padding_height, padding_width, dilate_height,
-        dilate_width, filter_grad_data, data_layout);
+    const int dilate_height, const int dilate_width, T* filter_grad_data) {
+  int final_filter_multiplier = filter_multiplier;
+  int h_stride = stride_height;
+  int w_stride = stride_width;
+  if (c_filter_multiplier != 0) {
+    final_filter_multiplier = c_filter_multiplier;
+    h_stride = c_stride;
+    w_stride = c_stride;
+  }
+  if (c_filter_multiplier == 0 || c_filter == -1) {
+    if (data_layout != DataLayout::kNHWC) {
+      KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>(
+          output_grad_data, input_data, num, output_channels, output_height,
+          output_width, input_channels, input_height, input_width,
+          final_filter_multiplier, filter_height, filter_width, h_stride,
+          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          filter_grad_data);
+    } else {
+      KernelDepthwiseConvFilterGradNHWC<T, fuse_relu_before_conv>(
+          output_grad_data, input_data, num, output_channels, output_height,
+          output_width, input_channels, input_height, input_width,
+          final_filter_multiplier, filter_height, filter_width, h_stride,
+          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          filter_grad_data);
+    }
+  } else {
+    if (data_layout != DataLayout::kNHWC) {
+      KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>(
+          output_grad_data, input_data, num, output_channels, output_height,
+          output_width, input_channels, input_height, input_width,
+          final_filter_multiplier, filter_height, filter_width, h_stride,
+          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          filter_grad_data);
+    } else {
+      KernelDepthwiseConvFilterGradCFilterNHWC<T, c_filter,
+                                               fuse_relu_before_conv>(
+          output_grad_data, input_data, num, output_channels, output_height,
+          output_width, input_channels, input_height, input_width,
+          final_filter_multiplier, filter_height, filter_width, h_stride,
+          w_stride, padding_height, padding_width, dilate_height, dilate_width,
+          filter_grad_data);
+    }
+  }
+}
 
 /*
@@ -608,19 +876,45 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
     const T* filter_data = filter.data<T>();
     T* output_data = output->mutable_data<T>(context.GetPlace());
 
+    framework::Tensor filter_hwc;
+    if (data_layout == DataLayout::kNHWC) {
+      framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3],
+                                       filter.dims()[0], filter.dims()[1]});
+      filter_hwc.Resize(filter_hwc_dims);
+      filter_hwc.mutable_data<T>(context.GetPlace());
+      std::vector<int> perm_axis({2, 3, 0, 1});
+      math::TransposeNormal<platform::CUDADeviceContext, T> trans;
+      trans(context, filter, &filter_hwc, perm_axis);
+      filter_data = filter_hwc.data<T>();
+    }
+
     int thread = 512;
-    if (output_width > 1024 && output_width <= 2048)
-      thread = (output_width - 1) / 2 + 1;
-    else if (output_width > 512 && output_width <= 1024)
-      thread = output_width;
+    int blocks;
+    dim3 threads;
+    dim3 grid;
+    if (data_layout != DataLayout::kNHWC) {
+      if (output_width > 1024 && output_width <= 2048)
+        thread = (output_width - 1) / 2 + 1;
+      else if (output_width > 512 && output_width <= 1024)
+        thread = output_width;
 #ifdef __HIPCC__
-    thread = std::min(thread, 256);
+      thread = std::min(thread, 256);
 #endif
-    int blocks = std::min(std::max(thread / output_width, 1), output_height);
-    dim3 threads(std::min(output_width, thread), blocks, 1);
-    dim3 grid(output_channels, batch_size, 1);
+      blocks = std::min(std::max(thread / output_width, 1), output_height);
+      threads = dim3(std::min(output_width, thread), blocks, 1);
+      grid = dim3(output_channels, batch_size, 1);
+    } else {
+#ifdef __HIPCC__
+      thread = std::min(thread, 256);
+#endif
+      blocks = std::min(
+          std::max(thread / output_channels, 1),
+          ((output_width + dilate_width - 1) / dilate_width) * dilate_width);
+      threads = dim3(std::min(output_channels, thread), blocks, 1);
+      grid = dim3((output_height + dilate_height - 1) / dilate_height,
+                  dilate_height, batch_size);
+    }
     int filter_multiplier = output_channels / input_channels;
+    int nums_output =
+        batch_size * output_channels * output_height * output_width;
 #ifdef __HIPCC__
@@ -631,26 +925,37 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
     int grid_size = (nums_output + block_size - 1) / block_size;
 #endif
 
-#define check_case(c_filter_multiplier, c_stride, c_filter)                  \
-  if (c_filter_multiplier == 0 ||                                            \
-      filter_multiplier == c_filter_multiplier &&                            \
-          stride_height == stride_width && stride_height == c_stride &&      \
-          (ksize_height == ksize_width && ksize_height == c_filter ||        \
-           c_filter == -1)) {                                                \
-    if (c_filter == -1) {                                                    \
-      threads.x = block_size;                                                \
-      grid.x = grid_size;                                                    \
-      threads.y = threads.z = grid.y = grid.z = 1;                           \
-    }                                                                        \
-    KernelDepthwiseConvSp<                                                   \
-        T, c_filter_multiplier, c_stride, c_filter,                          \
-        fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
-        input_data, filter_data, batch_size, output_channels, output_height, \
-        output_width, input_channels, input_height, input_width,             \
-        filter_multiplier, ksize_height, ksize_width, stride_height,         \
-        stride_width, padding_height, padding_width, dilate_height,          \
-        dilate_width, output_data, data_layout);                             \
-    return;                                                                  \
-  }
+#define check_case(c_filter_multiplier, c_stride, c_filter)                    \
+  if (c_filter_multiplier == 0 ||                                              \
+      filter_multiplier == c_filter_multiplier &&                              \
+          stride_height == stride_width && stride_height == c_stride &&        \
+          (ksize_height == ksize_width && ksize_height == c_filter ||          \
+           c_filter == -1)) {                                                  \
+    if (c_filter == -1) {                                                      \
+      threads.x = block_size;                                                  \
+      grid.x = grid_size;                                                      \
+      threads.y = threads.z = grid.y = grid.z = 1;                             \
+    }                                                                          \
+    if (data_layout != DataLayout::kNHWC) {                                    \
+      KernelDepthwiseConvSp<                                                   \
+          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW,       \
+          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
+          input_data, filter_data, batch_size, output_channels, output_height, \
+          output_width, input_channels, input_height, input_width,             \
+          filter_multiplier, ksize_height, ksize_width, stride_height,         \
+          stride_width, padding_height, padding_width, dilate_height,          \
+          dilate_width, output_data);                                          \
+    } else {                                                                   \
+      KernelDepthwiseConvSp<                                                   \
+          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC,       \
+          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
+          input_data, filter_data, batch_size, output_channels, output_height, \
+          output_width, input_channels, input_height, input_width,             \
+          filter_multiplier, ksize_height, ksize_width, stride_height,         \
+          stride_width, padding_height, padding_width, dilate_height,          \
+          dilate_width, output_data);                                          \
+    }                                                                          \
+    return;                                                                    \
+  }
     check_case(1, 1, 3);
     check_case(1, 1, 5);
@@ -714,32 +1019,67 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
+    framework::Tensor filter_hwc;
+    if (data_layout == DataLayout::kNHWC) {
+      framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3],
+                                       filter.dims()[0], filter.dims()[1]});
+      filter_hwc.Resize(filter_hwc_dims);
+      filter_hwc.mutable_data<T>(context.GetPlace());
+      std::vector<int> perm_axis({2, 3, 0, 1});
+      math::TransposeNormal<platform::CUDADeviceContext, T> trans;
+      trans(context, filter, &filter_hwc, perm_axis);
+      filter_data = filter_hwc.data<T>();
+    }
+
     int thread = 512;
-    if (input_width > 1024 && input_width <= 2048)
-      thread = (input_width - 1) / 2 + 1;
-    else if (input_width > 512 && input_width <= 1024)
-      thread = input_width;
-    int blocks = std::min(std::max(thread / input_width, 1), input_height);
-    dim3 threads(std::min(input_width, thread), blocks, 1);
-    dim3 grid(input_channels, batch_size, 1);
+    int blocks;
+    dim3 threads;
+    dim3 grid;
+    if (data_layout != DataLayout::kNHWC) {
+      if (input_width > 1024 && input_width <= 2048) {
+        thread = (input_width - 1) / 2 + 1;
+      } else if (input_width > 512 && input_width <= 1024) {
+        thread = input_width;
+      }
+      blocks = std::min(std::max(thread / input_width, 1), input_height);
+      threads = dim3(std::min(input_width, thread), blocks, 1);
+      grid = dim3(input_channels, batch_size, 1);
+    } else {
+      blocks = std::min(
+          std::max(thread / input_channels, 1),
+          ((input_width + dilate_width - 1) / dilate_width) * dilate_width);
+      threads = dim3(std::min(input_channels, thread), blocks, 1);
+      grid = dim3((input_height + dilate_height - 1) / dilate_height,
+                  dilate_height, batch_size);
+    }
     int filter_multiplier = output_channels / input_channels;
 
-#define check_case(c_filter_multiplier, c_stride, c_filter)             \
-  if (c_filter_multiplier == 0 ||                                       \
-      filter_multiplier == c_filter_multiplier &&                       \
-          stride_height == stride_width && stride_height == c_stride && \
-          (ksize_height == ksize_width && ksize_height == c_filter ||   \
-           c_filter == -1)) {                                           \
-    KernelDepthwiseConvInputGradSp<                                     \
-        T, c_filter_multiplier, c_stride, c_filter,                     \
-        fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
-        input_data, output_grad_data, filter_data, batch_size,          \
-        output_channels, output_height, output_width, input_channels,   \
-        input_height, input_width, filter_multiplier, ksize_height,     \
-        ksize_width, stride_height, stride_width, padding_height,       \
-        padding_width, dilate_height, dilate_width, input_grad_data,    \
-        data_layout);                                                   \
-    return;                                                             \
-  }
+#define check_case(c_filter_multiplier, c_stride, c_filter)               \
+  if (c_filter_multiplier == 0 ||                                         \
+      filter_multiplier == c_filter_multiplier &&                         \
+          stride_height == stride_width && stride_height == c_stride &&   \
+          (ksize_height == ksize_width && ksize_height == c_filter ||     \
+           c_filter == -1)) {                                             \
+    if (data_layout != DataLayout::kNHWC) {                               \
+      KernelDepthwiseConvInputGradSp<                                     \
+          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW,  \
+          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
+          input_data, output_grad_data, filter_data, batch_size,          \
+          output_channels, output_height, output_width, input_channels,   \
+          input_height, input_width, filter_multiplier, ksize_height,     \
+          ksize_width, stride_height, stride_width, padding_height,       \
+          padding_width, dilate_height, dilate_width, input_grad_data);   \
+    } else {                                                              \
+      KernelDepthwiseConvInputGradSp<                                     \
+          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC,  \
+          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
+          input_data, output_grad_data, filter_data, batch_size,          \
+          output_channels, output_height, output_width, input_channels,   \
+          input_height, input_width, filter_multiplier, ksize_height,     \
+          ksize_width, stride_height, stride_width, padding_height,       \
+          padding_width, dilate_height, dilate_width, input_grad_data);   \
+    }                                                                     \
+    return;                                                               \
+  }
     check_case(1, 1, 3);
     check_case(1, 1, 5);
@@ -802,30 +1142,95 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
     T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
 
     int block_size = 512;
-    if (output_width > 1024 && output_width <= 2048)
-      block_size = (output_width - 1) / 2 + 1;
-    else if (output_width > 512 && output_width <= 1024)
-      block_size = output_width;
-    int crop_output_height =
-        std::min(std::max(block_size / output_width, 1), output_height);
-    dim3 grid(ksize_width, ksize_height, output_channels);
-    dim3 threads(std::min(output_width, block_size), crop_output_height, 1);
+    int blocks;
+    dim3 threads;
+    dim3 grid;
+    if (data_layout != DataLayout::kNHWC) {
+      if (output_width > 1024 && output_width <= 2048) {
+        block_size = (output_width - 1) / 2 + 1;
+      } else if (output_width > 512 && output_width <= 1024) {
+        block_size = output_width;
+      }
+      blocks = std::min(std::max(block_size / output_width, 1), output_height);
+      grid = dim3(ksize_width, ksize_height, output_channels);
+      threads = dim3(std::min(output_width, block_size), blocks, 1);
+    } else {
+      blocks = std::min(
+          std::max(block_size / output_channels, 1),
+          ((output_width + dilate_width - 1) / dilate_width) * dilate_width);
+      grid = dim3((output_height + dilate_height - 1) / dilate_height,
+                  dilate_height, batch_size);
+      threads = dim3(std::min(output_channels, block_size), blocks, 1);
+    }
     int filter_multiplier = output_channels / input_channels;
 
-#define check_case(c_filter_multiplier)                                       \
-  if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \
-    KernelDepthwiseConvFilterGradSp<                                          \
-        T, c_filter_multiplier,                                               \
-        fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(       \
-        output_grad_data, input_data, batch_size, output_channels,            \
-        output_height, output_width, input_channels, input_height,            \
-        input_width, filter_multiplier, ksize_height, ksize_width,            \
-        stride_height, stride_width, padding_height, padding_width,           \
-        dilate_height, dilate_width, filter_grad_data, data_layout);          \
-    return;                                                                   \
-  }
+#define check_case(c_filter_multiplier, c_stride, c_filter)                   \
+  if (c_filter_multiplier == 0 ||                                             \
+      filter_multiplier == c_filter_multiplier &&                             \
+          stride_height == stride_width && stride_height == c_stride &&       \
+          (ksize_height == ksize_width && ksize_height == c_filter ||         \
+           c_filter == -1)) {                                                 \
+    if (data_layout != DataLayout::kNHWC) {                                   \
+      KernelDepthwiseConvFilterGradSp<                                        \
+          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW,      \
+          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(     \
+          output_grad_data, input_data, batch_size, output_channels,          \
+          output_height, output_width, input_channels, input_height,          \
+          input_width, filter_multiplier, ksize_height, ksize_width,          \
+          stride_height, stride_width, padding_height, padding_width,         \
+          dilate_height, dilate_width, filter_grad_data);                     \
+    } else {                                                                  \
+      framework::Tensor filter_grad_hwc;                                      \
+      if (c_filter != -1) {                                                   \
+        framework::DDim filter_grad_hwc_dims(                                 \
+            {filter_grad->dims()[2], filter_grad->dims()[3],                  \
+             filter_grad->dims()[0], filter_grad->dims()[1]});                \
+        filter_grad_hwc.Resize(filter_grad_hwc_dims);                         \
+        filter_grad_hwc.mutable_data<T>(context.GetPlace());                  \
+        math::SetConstant<platform::CUDADeviceContext, T> set_zero;           \
+        set_zero(context, &filter_grad_hwc, static_cast<T>(0));               \
+        filter_grad_data = filter_grad_hwc.data<T>();                         \
+      } else {                                                                \
+        block_size = 512;                                                     \
+        if (output_channels > 1024 && output_channels <= 2048) {              \
+          block_size = (output_channels - 1) / 2 + 1;                         \
+        } else if (output_channels > 512 && output_channels <= 1024) {        \
+          block_size = output_channels;                                       \
+        }                                                                     \
+        blocks =                                                              \
+            std::min(std::max(block_size / output_channels, 1), output_width); \
+        grid = dim3(ksize_width * ksize_height, output_height, batch_size);   \
+        threads = dim3(std::min(output_channels, block_size), blocks, 1);     \
+      }                                                                       \
+      KernelDepthwiseConvFilterGradSp<                                        \
+          T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC,      \
+          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(     \
+          output_grad_data, input_data, batch_size, output_channels,          \
+          output_height, output_width, input_channels, input_height,          \
+          input_width, filter_multiplier, ksize_height, ksize_width,          \
+          stride_height, stride_width, padding_height, padding_width,         \
+          dilate_height, dilate_width, filter_grad_data);                     \
+      if (c_filter != -1) {                                                   \
+        std::vector<int> perm_axis({2, 3, 0, 1});                             \
+        math::TransposeNormal<platform::CUDADeviceContext, T> trans;          \
+        trans(context, filter_grad_hwc, filter_grad, perm_axis);              \
+      }                                                                       \
+    }                                                                         \
+    return;                                                                   \
+  }
-    check_case(1);
-    check_case(0);
+    check_case(1, 1, 3);
+    check_case(1, 1, 5);
+    check_case(1, 1, -1);
+    check_case(1, 2, 3);
+    check_case(1, 2, 5);
+    check_case(1, 2, -1);
+    check_case(2, 1, 3);
+    check_case(2, 1, 5);
+    check_case(2, 1, -1);
+    check_case(2, 2, 3);
+    check_case(2, 2, 5);
+    check_case(2, 2, -1);
+    check_case(0, 0, -1);
 #undef check_case
   }
 };
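Editor's note on the launch configurations chosen for the NHWC kernels above: threadIdx.x walks the channel dimension (contiguous in NHWC), threadIdx.y walks width positions, and the grid covers height tiles, the dilation factor, and the batch, so consecutive threads in a warp touch consecutive memory. The standalone sketch below mirrors that shape computation with made-up sizes; the variable names and numbers are illustrative, not Paddle's exact heuristics.

#include <algorithm>
#include <cstdio>

int main() {
  // Illustrative NHWC tensor shape and the rough block/grid shape the
  // NHWC kernels above would derive for it.
  int batch = 8, channels = 96, height = 56, width = 56, dilate = 1;
  int thread = 512;
  int blocks = std::min(std::max(thread / channels, 1),
                        ((width + dilate - 1) / dilate) * dilate);
  int tx = std::min(channels, thread);      // threads.x -> channels
  int ty = blocks;                          // threads.y -> width positions
  int gx = (height + dilate - 1) / dilate;  // grid.x -> height tiles
  std::printf("threads = (%d, %d, 1), grid = (%d, %d, %d)\n",
              tx, ty, gx, dilate, batch);
  return 0;
}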
python/paddle/nn/functional/conv.py
@@ -112,10 +112,6 @@ def _conv_nd(x,
-    # Due to the poor performance of NHWC, we transpose the input to NCHW.
-    origin_format = data_format
-    if origin_format == "NHWC" and op_type == "depthwise_conv2d":
-        x = nn.transpose(x, perm=[0, 3, 1, 2])
-        data_format = "NCHW"
-        channel_dim = 1
     if in_dygraph_mode():
         attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
                  'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn',
@@ -159,10 +155,6 @@ def _conv_nd(x,
                          'use_mkldnn': use_mkldnn})
     else:
         out = pre_bias
-    if origin_format == "NHWC" and op_type == "depthwise_conv2d":
-        out = nn.transpose(out, perm=[0, 2, 3, 1])
     return out