Unverified commit ed2a1852 in BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle),
authored Nov 24, 2019 by gongweibao, committed via GitHub on Nov 24, 2019.
optimize nhwc for tensor core in ConvOp and ConvGradOp (#20597)
Parent: c918788b
Showing 5 changed files with 329 additions and 75 deletions (+329, -75):
paddle/fluid/operators/conv_cudnn_op.cu                  +191  -64
paddle/fluid/operators/conv_op.cc                          +8   -6
paddle/fluid/operators/conv_op.h                          +30   -0
paddle/fluid/platform/cudnn_desc.h                        +45   -3
python/paddle/fluid/tests/unittests/test_conv2d_op.py     +55   -2
paddle/fluid/operators/conv_cudnn_op.cu
@@ -40,6 +40,10 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
 
+static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) {
+  return dev_ctx.GetComputeCapability() >= 70;
+}
+
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
@@ -68,11 +72,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     const std::string data_format = ctx.Attr<std::string>("data_format");
     const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
 
+    auto dtype = platform::CudnnDataType<T>::type;
+
+    // Tensor Core introduced from Volta GPUs supports more faster conv op
+    // with FP16 in NHWC data format.
+    const bool compute_in_nhwc =
+        dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
+    // We will only do data format conversion from NHWC to NCHW.
+    // cudnn will convert NCHW to NHWC automatically on Tensor Core.
+    auto compute_format =
+        compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
+    VLOG(3) << "Compute ConvOp with cuDNN:"
+            << " data_format=" << data_format << " compute_format="
+            << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
+
     // ------------ transformed tensor -----------
     Tensor transformed_input_channel(input->type());
     Tensor transformed_output(output->type());
+    Tensor transformed_filter_channel(filter->type());
     T* output_data = nullptr;
-    if (channel_last) {
+    if (channel_last && compute_format == DataLayout::kNCHW) {
+      VLOG(3) << "Transform input tensor from NHWC to NCHW.";
       ResizeToChannelFirst<platform::CUDADeviceContext, T>(
           ctx, input, &transformed_input_channel);
       TransToChannelFirst<platform::CUDADeviceContext, T>(
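This hunk is the heart of the change: with FP16 data on a Volta-or-later GPU (compute capability 7.0+), the kernel keeps channel-last tensors in NHWC so cuDNN can dispatch Tensor Core kernels, instead of transposing everything to NCHW. A minimal standalone sketch of that decision rule; ChooseComputeFormat and the bare enum below are illustrative stand-ins for the Paddle types above, not Paddle APIs:

#include <cstdio>

// Stand-ins for platform::DataLayout and the dtype check in the hunk above.
enum class DataLayout { kNCHW, kNHWC };

// Decide the compute layout the way the kernel does: NHWC only when the
// tensor is FP16, the GPU is Volta (SM 7.0) or later, and the user already
// supplied channel-last data.
DataLayout ChooseComputeFormat(bool is_fp16, int compute_capability,
                               bool channel_last) {
  const bool compute_in_nhwc = is_fp16 && compute_capability >= 70;
  return compute_in_nhwc && channel_last ? DataLayout::kNHWC
                                         : DataLayout::kNCHW;
}

int main() {
  // FP16 on an SM 7.0 GPU with NHWC input: Tensor Core friendly NHWC.
  std::printf("%s\n", ChooseComputeFormat(true, 70, true) == DataLayout::kNHWC
                          ? "NHWC" : "NCHW");  // prints NHWC
  // FP32 on the same GPU stays NCHW.
  std::printf("%s\n", ChooseComputeFormat(false, 70, true) == DataLayout::kNHWC
                          ? "NHWC" : "NCHW");  // prints NCHW
  return 0;
}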
@@ -82,19 +102,36 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           &transformed_output);
     } else {
-      transformed_input_channel = *input;
-      transformed_output = *output;
+      transformed_input_channel.ShareDataWith(*input);
+      transformed_output.ShareDataWith(*output);
+    }
+
+    if (compute_format == DataLayout::kNHWC) {
+      VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
+      ResizeToChannelLast<platform::CUDADeviceContext, T>(
+          ctx, filter, &transformed_filter_channel);
+      TransToChannelLast<platform::CUDADeviceContext, T>(
+          ctx, filter, &transformed_filter_channel);
+    } else {
+      transformed_filter_channel.ShareDataWith(*filter);
     }
+
     output_data = transformed_output.data<T>();
 
     // update padding and dilation
     auto in_dims = transformed_input_channel.dims();
-    auto filter_dims = filter->dims();
+    auto filter_dims = transformed_filter_channel.dims();
     framework::DDim in_data_dims;
-    in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
-    framework::DDim filter_data_dims =
-        framework::slice_ddim(filter_dims, 2, filter_dims.size());
+    framework::DDim filter_data_dims;
+
+    if (compute_format == DataLayout::kNCHW) {
+      in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
+      filter_data_dims =
+          framework::slice_ddim(filter_dims, 2, filter_dims.size());
+    } else {
+      in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
+      filter_data_dims =
+          framework::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
+    }
+
     std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
     UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
                              in_data_dims, strides, ksize);
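The new branch is needed because the spatial extent lives at a different offset per layout: NCHW dims are {N, C, spatial...}, NHWC dims are {N, spatial..., C}. A small sketch of the two slices, with std::vector standing in for framework::DDim (SpatialDims is an illustrative helper, not Paddle code):

#include <cassert>
#include <vector>

// Extract the spatial dims the way the two slice_ddim calls above do.
std::vector<int> SpatialDims(const std::vector<int>& dims, bool channel_first) {
  if (channel_first) {  // NCHW: skip N and C at the front.
    return {dims.begin() + 2, dims.end()};
  }
  // NHWC: skip N at the front and C at the back.
  return {dims.begin() + 1, dims.end() - 1};
}

int main() {
  std::vector<int> nchw = {8, 3, 32, 32};  // N, C, H, W
  std::vector<int> nhwc = {8, 32, 32, 3};  // N, H, W, C
  assert(SpatialDims(nchw, true) == (std::vector<int>{32, 32}));
  assert(SpatialDims(nhwc, false) == (std::vector<int>{32, 32}));
  return 0;
}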
@@ -108,17 +145,33 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
       std::vector<int> padding_diff(data_dim);
       std::vector<int> new_input_shape_vec(data_dim + 2);
       new_input_shape_vec[0] = transformed_input_channel.dims()[0];
-      new_input_shape_vec[1] = transformed_input_channel.dims()[1];
+      if (compute_format == DataLayout::kNCHW) {
+        new_input_shape_vec[1] = transformed_input_channel.dims()[1];
+      } else {
+        new_input_shape_vec[data_dim + 1] =
+            transformed_input_channel.dims()[data_dim + 1];
+      }
 
       std::vector<int> input_pad(transformed_input_channel.dims().size() * 2,
                                  0);
       for (size_t i = 0; i < data_dim; ++i) {
         padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
         padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
-        new_input_shape_vec[i + 2] =
-            transformed_input_channel.dims()[i + 2] + padding_diff[i];
-        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
-        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
+        if (compute_format == DataLayout::kNCHW) {
+          new_input_shape_vec[i + 2] =
+              transformed_input_channel.dims()[i + 2] + padding_diff[i];
+        } else {
+          new_input_shape_vec[i + 1] =
+              transformed_input_channel.dims()[i + 1] + padding_diff[i];
+        }
+        if (compute_format == DataLayout::kNCHW) {
+          input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
+          input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
+        } else {
+          input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
+          input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
+        }
       }
       framework::DDim new_input_shape(
           framework::make_ddim(new_input_shape_vec));
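cuDNN takes a single padding value per spatial dim, so an asymmetric pair is normalized here: the symmetric part (the min) goes to cuDNN and the leftover (the abs difference) is applied by explicitly padding the input; the NCHW/NHWC branches only differ in which indices of the flattened pad vector hold the spatial entries. A worked one-dimension example, as a standalone sketch rather than Paddle code:

#include <algorithm>
#include <cstdio>
#include <cstdlib>

int main() {
  // Asymmetric padding {before, after} = {1, 2} for one spatial dim.
  int pad_before = 1, pad_after = 2;

  // Symmetric part handed to cuDNN, residual applied by padding the input,
  // mirroring padding_common / padding_diff in the hunk above.
  int padding_common = std::min(pad_before, pad_after);  // 1
  int padding_diff = std::abs(pad_before - pad_after);   // 1
  int input_pad_before = pad_before - padding_common;    // 0
  int input_pad_after = pad_after - padding_common;      // 1

  // A 32-wide dim grows by padding_diff before the conv sees it.
  std::printf("new extent: %d\n", 32 + padding_diff);    // 33
  std::printf("explicit pad: {%d, %d}, cudnn pad: %d\n",
              input_pad_before, input_pad_after, padding_common);
  return 0;
}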
@@ -147,7 +200,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
       }
     } else {
-      transformed_input = transformed_input_channel;
+      transformed_input.ShareDataWith(transformed_input_channel);
       if (paddings.size() == data_dim) {
         for (size_t i = 0; i < data_dim; ++i) {
           padding_common[i] = paddings[i];
@@ -160,18 +213,20 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     }
 
     const T* input_data = transformed_input.data<T>();
-    const T* filter_data = filter->data<T>();
+    const T* filter_data = transformed_filter_channel.data<T>();
 
     // ------------------- cudnn descriptors ---------------------
-    ConvArgs args{&transformed_input, filter, &transformed_output, strides,
-                  padding_common, dilations};
+    ConvArgs args{&transformed_input,  &transformed_filter_channel,
+                  &transformed_output, strides,
+                  padding_common,      dilations};
 
     auto handle = dev_ctx.cudnn_handle();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    auto dtype = platform::CudnnDataType<T>::type;
-    DataLayout layout = DataLayout::kNCHW;
-    if (transformed_input_channel.dims().size() == 5) {
-      layout = DataLayout::kNCDHW;
+    DataLayout layout = compute_format == DataLayout::kNHWC
+                            ? DataLayout::kNHWC
+                            : DataLayout::kNCHW;
+    if (transformed_input.dims().size() == 5) {
+      layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
+                                                   : DataLayout::kNCDHW;
     }
     auto layout_format = GetCudnnTensorFormat(layout);
@@ -186,21 +241,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
                                                     args.cdesc.desc(), groups));
     groups = 1;
 #endif
-    args.idesc.set(transformed_input, groups);
-    args.wdesc.set(*filter, layout_format, groups);
-    args.odesc.set(transformed_output, groups);
+    args.idesc.set(transformed_input, layout_format);
+    args.wdesc.set(transformed_filter_channel, layout_format, groups);
+    args.odesc.set(transformed_output, layout_format);
 
     int i_n, i_c, i_d, i_h, i_w;
-    GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
-             &i_h, &i_w);
     int o_n, o_c, o_d, o_h, o_w;
-    GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
-             &o_h, &o_w);
+    if (compute_format == DataLayout::kNHWC) {
+      GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
+               &i_h, &i_w);
+      GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d,
+               &o_h, &o_w);
+    } else {
+      GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
+               &i_h, &i_w);
+      GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
+               &o_h, &o_w);
+    }
 
     int group_offset_in = i_c / groups * i_h * i_w * i_d;
     int group_offset_out = o_c / groups * o_h * o_w * o_d;
-    int group_offset_filter = filter->numel() / groups;
+    int group_offset_filter = transformed_filter_channel.numel() / groups;
     // ------------------- cudnn conv workspace ---------------------
     size_t workspace_size = 0;  // final workspace to allocate.
     // ------------------- cudnn conv algorithm ---------------------
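The per-group offsets are plain element counts: each of the `groups` cuDNN calls in the surrounding kernel walks `channels / groups` worth of data into the input, output, and filter buffers. A quick check of the arithmetic on a concrete shape, as a standalone sketch with made-up sizes:

#include <cstdio>

int main() {
  // NCHW input 8 x 64 x 32 x 32, 2 groups, 128 output channels, 3x3 filter.
  int i_c = 64, i_d = 1, i_h = 32, i_w = 32;
  int o_c = 128, o_d = 1, o_h = 32, o_w = 32;
  int groups = 2;
  int filter_numel = o_c * (i_c / groups) * 3 * 3;  // M x (C/groups) x kH x kW

  // Per-group strides into each buffer, mirroring the hunk above.
  int group_offset_in = i_c / groups * i_h * i_w * i_d;    // 32 * 32 * 32
  int group_offset_out = o_c / groups * o_h * o_w * o_d;   // 64 * 32 * 32
  int group_offset_filter = filter_numel / groups;

  std::printf("in=%d out=%d filter=%d\n", group_offset_in, group_offset_out,
              group_offset_filter);
  return 0;
}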
@@ -225,7 +286,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           workspace_size);
     }
 
-    if (channel_last) {
+    if (channel_last && compute_format == DataLayout::kNCHW) {
       TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
           ctx, &transformed_output, output);
     }
@@ -245,7 +306,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
 
-    const T* filter_data = filter->data<T>();
     if (input_grad) {
       input_grad->mutable_data<T>(ctx.GetPlace());
     }
@@ -269,12 +329,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     const std::string data_format = ctx.Attr<std::string>("data_format");
     const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
 
+    auto dtype = platform::CudnnDataType<T>::type;
+    const bool compute_in_nhwc =
+        dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
+    auto compute_format =
+        compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
+    VLOG(3) << "Compute ConvGradOp with cuDNN:"
+            << " data_format=" << data_format << " compute_format="
+            << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
+
     // transform Tensor
     Tensor transformed_input_channel(input->type());
     Tensor transformed_output_grad_channel(output_grad->type());
     Tensor transformed_input_grad_channel(input->type());
+    Tensor transformed_filter_channel(filter->type());
+    Tensor transformed_filter_grad_channel(filter->type());
 
-    if (channel_last) {
+    if (channel_last && compute_format == DataLayout::kNCHW) {
+      VLOG(3) << "Transform input, output_grad, input_grad and tensor from "
+                 "NHWC to NCHW.";
       ResizeToChannelFirst<platform::CUDADeviceContext, T>(
           ctx, input, &transformed_input_channel);
       TransToChannelFirst<platform::CUDADeviceContext, T>(
@@ -289,22 +362,46 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
         ResizeToChannelFirst<platform::CUDADeviceContext, T>(
             ctx, input_grad, &transformed_input_grad_channel);
       }
     } else {
-      transformed_input_channel = *input;
-      transformed_output_grad_channel = *output_grad;
+      transformed_input_channel.ShareDataWith(*input);
+      transformed_output_grad_channel.ShareDataWith(*output_grad);
+      if (input_grad) {
+        transformed_input_grad_channel.ShareDataWith(*input_grad);
+      }
+    }
+
+    if (compute_format == DataLayout::kNHWC) {
+      VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC.";
+      ResizeToChannelLast<platform::CUDADeviceContext, T>(
+          ctx, filter, &transformed_filter_channel);
+      TransToChannelLast<platform::CUDADeviceContext, T>(
+          ctx, filter, &transformed_filter_channel);
+      if (filter_grad) {
+        ResizeToChannelLast<platform::CUDADeviceContext, T>(
+            ctx, filter_grad, &transformed_filter_grad_channel);
+      }
+    } else {
+      transformed_filter_channel.ShareDataWith(*filter);
+      if (filter_grad) {
+        transformed_filter_grad_channel.ShareDataWith(*filter_grad);
+      }
     }
 
     // update paddings
     auto in_dims = transformed_input_channel.dims();
-    auto filter_dims = filter->dims();
+    auto filter_dims = transformed_filter_channel.dims();
     framework::DDim in_data_dims;
-    in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
-    framework::DDim filter_data_dims =
-        framework::slice_ddim(filter_dims, 2, filter_dims.size());
+    framework::DDim filter_data_dims;
+    if (compute_format == DataLayout::kNCHW) {
+      in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
+      filter_data_dims =
+          framework::slice_ddim(filter_dims, 2, filter_dims.size());
+    } else {
+      in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
+      filter_data_dims =
+          framework::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
+    }
     std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
     UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
                              in_data_dims, strides, ksize);
@@ -323,15 +420,30 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       std::vector<int> padding_diff(data_dim);
       std::vector<int> new_input_shape_vec(data_dim + 2);
       new_input_shape_vec[0] = transformed_input_channel.dims()[0];
-      new_input_shape_vec[1] = transformed_input_channel.dims()[1];
+      if (compute_format == DataLayout::kNCHW) {
+        new_input_shape_vec[1] = transformed_input_channel.dims()[1];
+      } else {
+        new_input_shape_vec[data_dim + 1] =
+            transformed_input_channel.dims()[data_dim + 1];
+      }
 
       for (size_t i = 0; i < data_dim; ++i) {
         padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
         padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
-        new_input_shape_vec[i + 2] =
-            transformed_input_channel.dims()[i + 2] + padding_diff[i];
-        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
-        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
+        if (compute_format == DataLayout::kNCHW) {
+          new_input_shape_vec[i + 2] =
+              transformed_input_channel.dims()[i + 2] + padding_diff[i];
+        } else {
+          new_input_shape_vec[i + 1] =
+              transformed_input_channel.dims()[i + 1] + padding_diff[i];
+        }
+        if (compute_format == DataLayout::kNCHW) {
+          input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
+          input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
+        } else {
+          input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
+          input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
+        }
       }
       framework::DDim new_input_shape(
           framework::make_ddim(new_input_shape_vec));
@@ -384,42 +496,51 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     const T* input_data = transformed_input.data<T>();
     const T* output_grad_data = transformed_output_grad_channel.data<T>();
+    const T* filter_data = transformed_filter_channel.data<T>();
     T* filter_grad_data = nullptr;
     T* input_grad_data = nullptr;
     T* transformed_input_grad_data = nullptr;
 
-    ConvArgs args1{&transformed_input_grad, filter,
-                   &transformed_output_grad_channel, strides, padding_common,
-                   dilations};
-    ConvArgs args2{&transformed_input, filter_grad,
-                   &transformed_output_grad_channel, strides, padding_common,
-                   dilations};
+    ConvArgs args1{&transformed_input_grad,
+                   &transformed_filter_channel,
+                   &transformed_output_grad_channel,
+                   strides,
+                   padding_common,
+                   dilations};
+    ConvArgs args2{&transformed_input,
+                   &transformed_filter_grad_channel,
+                   &transformed_output_grad_channel,
+                   strides,
+                   padding_common,
+                   dilations};
 
     auto handle = dev_ctx.cudnn_handle();
-    auto dtype = platform::CudnnDataType<T>::type;
-    DataLayout layout = DataLayout::kNCHW;
-    if (input->dims().size() == 5) {
-      layout = DataLayout::kNCDHW;
+    DataLayout layout = compute_format == DataLayout::kNHWC
+                            ? DataLayout::kNHWC
+                            : DataLayout::kNCHW;
+    if (transformed_input.dims().size() == 5) {
+      layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
+                                                   : DataLayout::kNCDHW;
     }
     auto layout_tensor = GetCudnnTensorFormat(layout);
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
 
     int i_n, i_c, i_d, i_h, i_w;
-    GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
-             &i_h, &i_w);
     int o_n, o_c, o_d, o_h, o_w;
-    GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n,
-             &o_c, &o_d, &o_h, &o_w);
+    if (compute_format == DataLayout::kNHWC) {
+      GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
+               &i_h, &i_w);
+      GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC,
+               &o_n, &o_c, &o_d, &o_h, &o_w);
+    } else {
+      GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
+               &i_h, &i_w);
+      GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW,
+               &o_n, &o_c, &o_d, &o_h, &o_w);
+    }
 
     int group_offset_in = i_c / groups * i_h * i_w * i_d;
     int group_offset_out = o_c / groups * o_h * o_w * o_d;
-    int group_offset_filter = filter->numel() / groups;
+    int group_offset_filter = transformed_filter_channel.numel() / groups;
     // ------------------- cudnn backward algorithm ---------------------
     cudnnConvolutionBwdDataAlgo_t data_algo =
         static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
@@ -439,9 +560,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       input_grad_data = input_grad->data<T>();
       transformed_input_grad_data = transformed_input_grad.data<T>();
       args1.handle = handle;
-      args1.idesc.set(transformed_input_grad, iwo_groups);
-      args1.wdesc.set(*filter, layout_tensor, iwo_groups);
-      args1.odesc.set(transformed_output_grad_channel, iwo_groups);
+      args1.idesc.set(transformed_input_grad, layout_tensor);
+      args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
+      args1.odesc.set(transformed_output_grad_channel, layout_tensor);
       args1.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
 
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
@@ -453,11 +574,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     if (filter_grad) {
       // ------------------- cudnn descriptors ---------------------
-      filter_grad_data = filter_grad->data<T>();
+      filter_grad_data = transformed_filter_grad_channel.data<T>();
       args2.handle = handle;
-      args2.idesc.set(transformed_input, iwo_groups);
-      args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups);
-      args2.odesc.set(transformed_output_grad_channel, iwo_groups);
+      args2.idesc.set(transformed_input, layout_tensor);
+      args2.wdesc.set(transformed_filter_grad_channel, layout_tensor,
+                      iwo_groups);
+      args2.odesc.set(transformed_output_grad_channel, layout_tensor);
       args2.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
 
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
@@ -506,7 +628,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       }
     }
 
-    if (channel_last) {
+    if (channel_last && compute_format == DataLayout::kNCHW) {
       TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
           ctx, &transformed_input_grad_channel, input_grad);
     }
@@ -527,6 +649,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
             },
             workspace_size);
       }
+
+      if (compute_format == DataLayout::kNHWC) {
+        TransToChannelFirst<paddle::platform::CUDADeviceContext, T>(
+            ctx, &transformed_filter_grad_channel, filter_grad);
+      }
     }
   }
 };
paddle/fluid/operators/conv_op.cc
@@ -97,13 +97,15 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
                         filter_dims[0], filter_dims, groups);
 
   framework::DDim in_data_dims;
+  framework::DDim filter_data_dims;
   if (channel_last) {
     in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
   } else {
     in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
   }
-  framework::DDim filter_data_dims =
-      framework::slice_ddim(filter_dims, 2, filter_dims.size());
+
+  filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size());
+
   std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
   UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
                            in_data_dims, strides, ksize);
@@ -117,9 +119,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
         (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) {
       output_shape.push_back(-1);
     } else {
-      output_shape.push_back(ConvOutputSize(in_data_dims[i], filter_dims[i + 2],
-                                            dilations[i], paddings[2 * i],
-                                            paddings[2 * i + 1], strides[i]));
+      output_shape.push_back(ConvOutputSize(
+          in_data_dims[i], filter_data_dims[i], dilations[i], paddings[2 * i],
+          paddings[2 * i + 1], strides[i]));
     }
   }
   if (channel_last) {
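Since filter_data_dims is filter_dims with the M and C dims sliced off, filter_data_dims[i] equals filter_dims[i + 2] here; the rewrite just routes the output-size computation through the slice chosen above. For reference, the textbook formula that ConvOutputSize evaluates, as a hedged standalone sketch (the real helper in the Paddle source may carry extra checks):

#include <cassert>

// Standard convolution output-size formula, matching the call signature used
// above: (input, filter, dilation, pad_before, pad_after, stride).
int ConvOutputSize(int input, int filter, int dilation, int pad_0, int pad_1,
                   int stride) {
  int dilated_filter = dilation * (filter - 1) + 1;
  return (input + pad_0 + pad_1 - dilated_filter) / stride + 1;
}

int main() {
  // 32-wide input, 3-wide filter, no dilation, pad 1 each side, stride 1 -> 32.
  assert(ConvOutputSize(32, 3, 1, 1, 1, 1) == 32);
  // Same input with stride 2 -> 16.
  assert(ConvOutputSize(32, 3, 1, 1, 1, 2) == 16);
  return 0;
}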
@@ -335,7 +337,7 @@ parameters is checked in the infer-shape.
 Input(Input) and Output(Output) are in NCHW or NHWC format. Where N is batch
 size, C is the number of channels, H is the height of the feature, and W is
 the width of the feature.
-Filters(Input) is MCHW format. Where M is the number of output image channels, C is
+Filters(Input) is MCHW format format. Where M is the number of output image channels, C is
 the number of input image channels, H is the height of the filter, and W
 is the width of the filter.
 Parameters(strides, paddings, dilations) are two elements. These two elements represent
paddle/fluid/operators/conv_op.h
@@ -154,6 +154,36 @@ inline void ResizeToChannelFirst(const framework::ExecutionContext& context,
   }
 }
 
+template <typename DeviceContext, typename T>
+inline void ResizeToChannelLast(const framework::ExecutionContext& context,
+                                const Tensor* input,
+                                Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = framework::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[3];
+    in_dims_vec[3] = input->dims()[4];
+    in_dims_vec[4] = input->dims()[1];
+    transformed_input->Resize(framework::make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+
+  } else if (dim == 2) {
+    // input
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = framework::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[3];
+    in_dims_vec[3] = input->dims()[1];
+    transformed_input->Resize(framework::make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+  }
+}
+
 template <typename DeviceContext, typename T>
 inline void TransToChannelFirst(const framework::ExecutionContext& context,
                                 const Tensor* input,
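ResizeToChannelLast only computes the permuted shape and allocates; the matching TransToChannelLast actually moves the elements. For the 4-D case the shuffle is simply NCHW to NHWC. A tiny sketch of that permutation (ToChannelLast4D is an illustrative helper, not the Paddle template):

#include <cassert>
#include <vector>

// The 4-D index shuffle ResizeToChannelLast applies above: the channel dim
// moves from position 1 to the back (NCHW -> NHWC).
std::vector<int> ToChannelLast4D(const std::vector<int>& dims) {
  return {dims[0], dims[2], dims[3], dims[1]};
}

int main() {
  std::vector<int> nchw = {8, 3, 32, 32};
  assert(ToChannelLast4D(nchw) == (std::vector<int>{8, 32, 32, 3}));
  return 0;
}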
paddle/fluid/platform/cudnn_desc.h
@@ -34,6 +34,29 @@ inline cudnnDataType_t ToCudnnDataType(const T& t) {
   return ToCudnnDataType(type);
 }
 
+inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
+  std::vector<int> transformed_dims(dims.begin(), dims.end());
+  int H, W, D, C;
+  if (dims.size() == 4) {
+    H = dims[1];
+    W = dims[2];
+    C = dims[3];
+    transformed_dims[1] = C;
+    transformed_dims[2] = H;
+    transformed_dims[3] = W;
+  } else {
+    D = dims[1];
+    H = dims[2];
+    W = dims[3];
+    C = dims[4];
+    transformed_dims[1] = C;
+    transformed_dims[2] = D;
+    transformed_dims[3] = H;
+    transformed_dims[4] = W;
+  }
+  return transformed_dims;
+}
+
 template <>
 inline cudnnDataType_t ToCudnnDataType(
     const framework::proto::VarType::Type& t) {
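Note the direction: TransformDimOrder takes dims as read from a channel-last tensor (N, H, W, C) and reorders them to (N, C, H, W), because cuDNN's Nd descriptor setters expect NCHW-ordered dims even when the format flag says NHWC. A quick check of the 4-D branch as a standalone sketch:

#include <cassert>
#include <vector>

// Mirror of the 4-D branch of TransformDimOrder above: NHWC-ordered dims
// are rearranged into NCHW order for cuDNN's descriptor API.
std::vector<int> TransformDimOrder4D(const std::vector<int>& dims) {
  int H = dims[1], W = dims[2], C = dims[3];
  return {dims[0], C, H, W};
}

int main() {
  std::vector<int> nhwc = {2, 5, 7, 3};  // N, H, W, C
  assert(TransformDimOrder4D(nhwc) == (std::vector<int>{2, 3, 5, 7}));
  return 0;
}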
@@ -117,6 +140,19 @@ class TensorDescriptor {
         dims_with_group.data(), strides.data()));
   }
 
+  void set(const Tensor& tensor, const cudnnTensorFormat_t format) {
+    auto dims = framework::vectorize<int>(tensor.dims());
+    std::vector<int> transformed_dims;
+    if (format == CUDNN_TENSOR_NHWC) {
+      transformed_dims = TransformDimOrder(dims);
+    } else {
+      transformed_dims = dims;
+    }
+    CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptorEx(
+        desc_.get(), format, ToCudnnDataType(tensor.type()),
+        transformed_dims.size(), transformed_dims.data()));
+  }
+
  private:
   std::unique_ptr<T, Deleter> desc_;
 };
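For context, this is roughly the raw cuDNN call the new overload wraps: cudnnSetTensorNdDescriptorEx takes a format flag plus NCHW-ordered dims and derives the strides itself. A hedged sketch against the cuDNN C API, with error handling elided and made-up sizes:

#include <cudnn.h>

// Describe an NHWC FP16 tensor of shape N=8, H=32, W=32, C=64. As in the
// hunk above, the dims are passed in NCHW order; the NHWC-ness is carried
// entirely by the format flag.
void DescribeNhwcTensor(cudnnTensorDescriptor_t desc) {
  const int dims_nchw_order[4] = {8, 64, 32, 32};  // N, C, H, W
  cudnnSetTensorNdDescriptorEx(desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF,
                               4, dims_nchw_order);
}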
@@ -143,12 +179,18 @@ class FilterDescriptor {
   void set(const Tensor& tensor, const cudnnTensorFormat_t format,
            const int groups = 1) {
     auto dims = framework::vectorize<int>(tensor.dims());
+    std::vector<int> transformed_dims;
+    if (format == CUDNN_TENSOR_NHWC) {
+      transformed_dims = TransformDimOrder(dims);
+    } else {
+      transformed_dims = dims;
+    }
     if (groups > 1) {
-      dims[1] = dims[1] / groups;
+      transformed_dims[1] = transformed_dims[1] / groups;
     }
     CUDNN_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
-        desc_.get(), ToCudnnDataType(tensor.type()), format, dims.size(),
-        dims.data()));
+        desc_.get(), ToCudnnDataType(tensor.type()), format,
+        transformed_dims.size(), transformed_dims.data()));
   }
 
  private:
python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -81,7 +81,6 @@ def conv2d_forward_naive(input,
     if len(pad) == 4:
         pad_h_0, pad_h_1 = pad[0], pad[1]
         pad_w_0, pad_w_1 = pad[2], pad[3]
-
     out_h = 1 + (in_h + pad_h_0 + pad_h_1 - (dilation[0] * (f_h - 1) + 1)) // stride[0]
     out_w = 1 + (in_w + pad_w_0 + pad_w_1 - (dilation[1] *
@@ -204,6 +203,50 @@ def create_test_cudnn_channel_last_class(parent):
     globals()[cls_name] = TestCudnnChannelLastCase
 
 
+def create_test_cudnn_channel_last_fp16_class(parent, grad_check=True):
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestCudnnChannelLastFp16(parent):
+        def init_kernel_type(self):
+            self.use_cudnn = True
+            self.dtype = np.float16
+
+        def test_check_output(self):
+            if core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    self.check_output_with_place(place, atol=2e-2)
+
+        def test_check_grad_no_filter(self):
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place) and grad_check:
+                self.check_grad_with_place(
+                    place, ['Input'],
+                    'Output',
+                    max_relative_error=0.02,
+                    no_grad_set=set(['Filter']))
+
+        def test_check_grad_no_input(self):
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place) and grad_check:
+                self.check_grad_with_place(
+                    place, ['Filter'],
+                    'Output',
+                    max_relative_error=0.02,
+                    no_grad_set=set(['Input']))
+
+        def init_data_format(self):
+            self.data_format = "NHWC"
+
+        def init_test_case_2(self):
+            N, C, H, W = self.input_size
+            self.input_size = [N, H, W, C]
+
+    cls_name = "{0}_{1}".format(parent.__name__, "CudnnChannelLastFp16")
+    TestCudnnChannelLastFp16.__name__ = cls_name
+    globals()[cls_name] = TestCudnnChannelLastFp16
+
+
 def create_test_padding_SAME_class(parent):
     class TestPaddingSMAECase(parent):
         def init_paddings(self):
@@ -699,7 +742,6 @@ class TestConv2dOp_v2(OpTest):
         self.init_dilation()
         self.init_data_format()
         self.init_test_case()
-
         self.init_paddings()
 
         self.init_test_case_2()
@@ -1195,6 +1237,17 @@ create_test_cudnn_channel_last_class(TestWithStride_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithDilation_AsyPadding)
 
+create_test_cudnn_channel_last_fp16_class(
+    TestConv2dOp_AsyPadding, grad_check=False)
+create_test_cudnn_channel_last_fp16_class(
+    TestWithPad_AsyPadding, grad_check=False)
+create_test_cudnn_channel_last_fp16_class(
+    TestWithStride_AsyPadding, grad_check=False)
+create_test_cudnn_channel_last_fp16_class(
+    TestWithGroup_AsyPadding, grad_check=False)
+create_test_cudnn_channel_last_fp16_class(
+    TestWithDilation_AsyPadding, grad_check=False)
+
 # --------- test python API ---------------
 class TestConv2dAPI(OpTest):