Commit bc47e7ac (unverified) · PaddlePaddle/Paddle
Authored Oct 24, 2022 by Yiqun Liu; committed via GitHub on Oct 24, 2022.
Enhance the implementation of some conv functions. (#47281)
Parent: 2f3ad5ab
Showing 7 changed files with 112 additions and 132 deletions (+112 −132):
- paddle/phi/kernels/gpudnn/conv_cudnn_v7.h (+28 −55)
- paddle/phi/kernels/gpudnn/conv_gpudnn_base.h (+12 −5)
- paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu (+17 −17)
- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu (+23 −27)
- paddle/phi/kernels/gpudnn/conv_kernel.cu (+5 −6)
- paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu (+25 −20)
- paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu (+2 −2)
paddle/phi/kernels/gpudnn/conv_cudnn_v7.h
```diff
@@ -75,9 +75,9 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
                            SearchResult<AlgoT>* search_result) {
   int best_algo_idx = -1;
   for (size_t i = 0; i < perf_results.size(); ++i) {
-    auto result = perf_results[i];
+    const auto& result = perf_results[i];
     if (result.status == CUDNN_STATUS_SUCCESS &&
-        result.memory < workspace_limit) {
+        result.memory <= workspace_limit) {
       if (best_algo_idx == -1) {
         // The algorithm which has minimize time cost and need a workspace_size
         // fitting the workspace_limit constraint.
@@ -87,8 +87,10 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
         break;
       }
     } else {
-      float best_algo_time = perf_results[best_algo_idx].time;
-      if ((result.time - best_algo_time) / best_algo_time < 0.01) {
+      // Compared to the next suboptimal algorithm, if the best one only has
+      // 1% performance difference, we'd like to pick the one which need less
+      // memory.
+      if (result.time < 1.01 * perf_results[best_algo_idx].time) {
         best_algo_idx =
             (result.memory < perf_results[best_algo_idx].memory) ? i
                                                                  : best_algo_idx;
@@ -98,9 +100,15 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
       }
     }
   }
   if (best_algo_idx != -1) {
-    search_result->algo = perf_results[best_algo_idx].algo;
-    search_result->time = perf_results[best_algo_idx].time;
-    search_result->workspace_size = perf_results[best_algo_idx].memory;
+    const auto& result = perf_results[best_algo_idx];
+    search_result->algo = result.algo;
+    search_result->time = result.time;
+    search_result->workspace_size = result.memory;
+    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
+    VLOG(3) << "Choose algo=" << result.algo << ", tensor_core="
+            << math_type_str << ", time=" << result.time
+            << " ms, memory=" << ToMegaBytes(result.memory)
+            << " MB, status=" << result.status;
   } else {
     VLOG(3) << "Can not find an algorithm that requires memory < "
             << ToMegaBytes(workspace_limit) << " MB";
```
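Taken together, these hunks tighten `ChooseAlgoByWorkspace` into a single policy: take the fastest algorithm whose workspace fits the limit, then let any later result within 1% of that time win if it needs less memory. Below is a minimal standalone sketch of that policy, assuming the perf results arrive sorted by ascending time (as cuDNN's find-algorithm APIs return them); `PerfResult` and `ChooseByWorkspace` are illustrative stand-ins, not Paddle or cuDNN names, and the real code's early-exit and Tensor Core details are elided.

```cpp
// Minimal sketch of the selection policy, under the assumptions stated above.
// PerfResult stands in for the cudnnConvolution*AlgoPerf_t structs.
#include <cstddef>
#include <vector>

struct PerfResult {
  int algo;       // algorithm id
  bool ok;        // stands in for status == CUDNN_STATUS_SUCCESS
  float time;     // measured runtime in ms; results assumed sorted by time
  size_t memory;  // required workspace in bytes
};

// Returns the index of the chosen algorithm, or -1 if none fits.
int ChooseByWorkspace(const std::vector<PerfResult>& results,
                      size_t workspace_limit) {
  int best = -1;
  for (size_t i = 0; i < results.size(); ++i) {
    const PerfResult& r = results[i];
    if (!r.ok || r.memory > workspace_limit) continue;
    if (best == -1) {
      // The fastest algorithm whose workspace fits the limit.
      best = static_cast<int>(i);
    } else if (r.time < 1.01f * results[best].time &&
               r.memory < results[best].memory) {
      // Within 1% of the best time: prefer the smaller workspace.
      best = static_cast<int>(i);
    }
  }
  return best;
}
```

This is why the first hunk relaxes the fit test to `memory <= workspace_limit` and the second replaces the relative-difference computation with the equivalent `time < 1.01 * best_time` form.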
```diff
@@ -626,7 +634,8 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
           perf_results, perf_results.size(), workspace_size_limit);
-      ChooseAlgo(perf_results, workspace_size_limit, &result);
+      ChooseAlgoByWorkspace<PerfT, AlgoT>(
+          perf_results, workspace_size_limit, &result);
     }

     result.workspace_size = GetWorkspaceSize(args, result.algo);
@@ -673,42 +682,6 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
       return workspace_size_limit;
     }
   }
-
-  static void ChooseAlgo(const std::vector<PerfT>& perf_results,
-                         size_t workspace_limit,
-                         SearchResult<AlgoT>* algo_result) {
-    for (size_t i = 0; i != perf_results.size(); ++i) {
-      const auto& result = perf_results[i];
-      if (result.status == CUDNN_STATUS_SUCCESS &&
-          (result.memory <= workspace_limit)) {
-        if ((result.mathType == CUDNN_TENSOR_OP_MATH) &&
-            (i != perf_results.size() - 1)) {
-          const auto& next_result = perf_results[i + 1];
-          if (next_result.status == CUDNN_STATUS_SUCCESS &&
-              next_result.algo == result.algo &&
-              next_result.memory == result.memory &&
-              next_result.mathType != CUDNN_TENSOR_OP_MATH &&
-              next_result.time < 1.01 * result.time) {
-            // Skip over this result- it's not really a Tensor Core algo.
-            // Because it is only 1% performance difference.
-            // Prefer to choose the next equivalent non-Tensor Core algo.
-            continue;
-          }
-        }
-        algo_result->algo = result.algo;
-        algo_result->time = result.time;
-        auto math_type_str = "0";
-        if (result.mathType == CUDNN_TENSOR_OP_MATH) {
-          math_type_str = "1";
-        }
-        VLOG(3) << "    choose algo: " << result.algo
-                << ", TC: " << math_type_str << ", time: " << result.time
-                << " ms, wksp = " << result.memory
-                << ", status = " << result.status;
-        break;
-      }
-    }
-  }
 };

 template <typename PerfT>
```
```diff
@@ -735,7 +708,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
     //    Auto tune is only enabled between specified range.
     // 3. After auto-tune process, run cached algorithm if cached, run
     //    default mode for the rest.
-    auto key = args.Convert2ConvCacheKey<T>();
+    auto key = args.ConvertToConvCacheKey<T>();
     auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
         SearchAlgorithmBase<PerfT>::kAlgoType);
     bool find_in_cache = cache.Find(key);
@@ -746,7 +719,6 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
       result.exhaustive_search = t.exhaustive_search;
     }
     if (!result.exhaustive_search) {
-      bool need_update_cache = false;
       // In conv2d_tranpose, enable_autotune is set to false because some
       // algorithm picked by exhaustive search method produce wrong result.
       use_autotune = enable_autotune &&
@@ -757,17 +729,18 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
         result =
             SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
                 args, ctx);
-        need_update_cache = true;
+        cache.Set(key,
+                  phi::autotune::ConvAutoTuneResult(
+                      static_cast<int64_t>(result.algo),
+                      result.workspace_size,
+                      true));
       } else if (!find_in_cache) {
         result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
-        need_update_cache = true;
-      }
-      if (need_update_cache) {
-        phi::autotune::ConvAutoTuneResult node(
-            static_cast<int64_t>(result.algo),
-            result.workspace_size,
-            exhaustive_search || use_autotune);
-        cache.Set(key, node);
+        cache.Set(key,
+                  phi::autotune::ConvAutoTuneResult(
+                      static_cast<int64_t>(result.algo),
+                      result.workspace_size,
+                      false));
       }
     }
   }
```
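After these hunks the caching contract is explicit at each call site: the exhaustive-search branch always re-measures and stores its result with the trusted flag set, while the heuristic branch fills the cache only on a miss, with the flag cleared. A compact sketch of that flow follows, using simplified stand-ins for phi::autotune's cache, key, and result types (none of the names below are Paddle's real API).

```cpp
// Sketch of the post-change cache flow; CachedAlgo, ConvCacheKey, and
// FindAlgo are illustrative stand-ins, not phi::autotune's real types.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>

struct CachedAlgo {
  int64_t algo = 0;
  std::size_t workspace_size = 0;
  bool exhaustive = false;  // true only for exhaustively searched entries
};

using ConvCacheKey = std::size_t;  // Paddle hashes shapes/dtype/attrs into a key
using Cache = std::unordered_map<ConvCacheKey, CachedAlgo>;

CachedAlgo FindAlgo(const ConvCacheKey& key,
                    bool use_autotune,
                    Cache& cache,
                    const std::function<CachedAlgo()>& exhaustive_search,
                    const std::function<CachedAlgo()>& heuristic_search) {
  if (use_autotune) {
    // Exhaustive search: re-measure and cache the winner as trusted.
    CachedAlgo r = exhaustive_search();
    r.exhaustive = true;
    cache[key] = r;
    return r;
  }
  auto it = cache.find(key);
  if (it == cache.end()) {
    // Cache miss: fall back to the cheap heuristic, cached as non-trusted.
    CachedAlgo r = heuristic_search();
    r.exhaustive = false;
    cache[key] = r;
    return r;
  }
  return it->second;  // cache hit: reuse the stored algorithm
}
```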
paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
```diff
@@ -69,10 +69,15 @@ static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
 template <typename HandleT, typename DataT>
 struct ConvArgsBase {
   HandleT handle;
-  paddle::platform::TensorDescriptor idesc, odesc;
+  paddle::platform::TensorDescriptor idesc;
+  paddle::platform::TensorDescriptor odesc;
   paddle::platform::FilterDescriptor wdesc;
   paddle::platform::ConvolutionDescriptor cdesc;
-  const phi::DenseTensor *x, *w, *o;
+  const phi::DenseTensor* x = nullptr;
+  const phi::DenseTensor* w = nullptr;
+  const phi::DenseTensor* o = nullptr;
   DataT cudnn_dtype;

   // strides
@@ -88,7 +93,8 @@ struct ConvArgsBase {
   // data foramt
   GPUDNNDataLayout data_layout;

-  ConvArgsBase(const phi::DenseTensor* x,
+  ConvArgsBase(const HandleT& h,
+               const phi::DenseTensor* x,
                const phi::DenseTensor* w,
                const phi::DenseTensor* o,
                const std::vector<int> s,
@@ -97,7 +103,8 @@ struct ConvArgsBase {
                DataT dtype,
                int g,
                GPUDNNDataLayout layout)
-      : x(x),
+      : handle(h),
+        x(x),
         w(w),
         o(o),
         s(s),
@@ -108,7 +115,7 @@ struct ConvArgsBase {
         data_layout(layout) {}

   template <typename T>
-  phi::autotune::ConvCacheKey Convert2ConvCacheKey() const {
+  phi::autotune::ConvCacheKey ConvertToConvCacheKey() const {
     auto x_shape = phi::vectorize(x->dims());
     auto w_shape = phi::vectorize(w->dims());
     VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
```
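The practical effect of this header change shows up at every `ConvArgs` construction site in the .cu files below: the cuDNN/MIOpen handle becomes a constructor argument instead of a field patched in afterwards, and the tensor pointers now default to `nullptr` instead of being left uninitialized. A standalone sketch of the pattern, with simplified stand-ins for `ConvArgsBase`, `phi::DenseTensor`, and `cudnnHandle_t`:

```cpp
// Sketch of the constructor-injection change; Handle, Tensor, and ConvArgs
// are simplified stand-ins for cudnnHandle_t, phi::DenseTensor, and
// ConvArgsBase.
#include <utility>
#include <vector>

using Handle = void*;  // stand-in for cudnnHandle_t / miopenHandle_t
struct Tensor {};      // stand-in for phi::DenseTensor

struct ConvArgs {
  Handle handle;
  const Tensor* x = nullptr;  // default initializers replace the old
  const Tensor* w = nullptr;  // uninitialized `*x, *w, *o` declaration
  const Tensor* o = nullptr;
  std::vector<int> strides;

  ConvArgs(const Handle& h,
           const Tensor* x_in,
           const Tensor* w_in,
           const Tensor* o_in,
           std::vector<int> s)
      : handle(h), x(x_in), w(w_in), o(o_in), strides(std::move(s)) {}
};

void Example(Handle handle, const Tensor& in, const Tensor& filter,
             const Tensor& out) {
  // Before: ConvArgs args{&in, &filter, &out, {1, 1}}; args.handle = handle;
  // After: the object is fully initialized in one expression.
  ConvArgs args{handle, &in, &filter, &out, {1, 1}};
  (void)args;
}
```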
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
```diff
@@ -257,7 +257,8 @@ void ConvCudnnGradGradKernel(
   auto layout = paddle::platform::GetCudnnTensorFormat(
       paddle::platform::DataLayout::kNCHW);

-  ConvArgs args1{&transformed_ddX,
+  ConvArgs args1{handle,
+                 &transformed_ddX,
                  W,
                  &transformed_ddO_channel,
                  strides,
@@ -266,7 +267,8 @@ void ConvCudnnGradGradKernel(
                  dtype,
                  groups,
                  paddle::platform::DataLayout::kNCHW};
-  ConvArgs args2{&transformed_X,
+  ConvArgs args2{handle,
+                 &transformed_X,
                  ddW,
                  &transformed_ddO_channel,
                  strides,
@@ -275,7 +277,8 @@ void ConvCudnnGradGradKernel(
                  dtype,
                  groups,
                  paddle::platform::DataLayout::kNCHW};
-  ConvArgs args3{&transformed_ddX,
+  ConvArgs args3{handle,
+                 &transformed_ddX,
                  dW,
                  &transformed_dO_channel,
                  strides,
@@ -284,7 +287,8 @@ void ConvCudnnGradGradKernel(
                  dtype,
                  groups,
                  paddle::platform::DataLayout::kNCHW};
-  ConvArgs args4{&transformed_dX,
+  ConvArgs args4{handle,
+                 &transformed_dX,
                  ddW,
                  &transformed_dO_channel,
                  strides,
@@ -314,7 +318,6 @@ void ConvCudnnGradGradKernel(
     ddy = ddO->data<T>();
     transformed_ddy_channel = transformed_ddO_channel.data<T>();
     if (ddX) {
-      args1.handle = handle;
       args1.idesc.set(transformed_ddX, iwo_group);
       args1.wdesc.set(*W, layout, iwo_group);
       args1.odesc.set(transformed_ddO_channel, iwo_group);
@@ -339,7 +342,6 @@ void ConvCudnnGradGradKernel(
     if (ddW) {
       ddw = ddW->data<T>();
-      args2.handle = handle;
       args2.idesc.set(transformed_X, iwo_group);
       args2.wdesc.set(*ddW, layout, iwo_group);
       args2.odesc.set(transformed_ddO_channel, iwo_group);
@@ -367,7 +369,6 @@ void ConvCudnnGradGradKernel(
   if (dW && ddX) {
     dw = dW->data<T>();
-    args3.handle = handle;
     args3.idesc.set(transformed_ddX, iwo_group);
     args3.wdesc.set(*dW, layout, iwo_group);
     args3.odesc.set(transformed_dO_channel, iwo_group);
@@ -395,7 +396,6 @@ void ConvCudnnGradGradKernel(
   if (ddW && dX) {
     transformed_dx = transformed_dX.data<T>();
-    args4.handle = handle;
     args4.idesc.set(transformed_dX, iwo_group);
     args4.wdesc.set(*ddW, layout, iwo_group);
     args4.odesc.set(transformed_dO_channel, iwo_group);
@@ -444,13 +444,13 @@ void ConvCudnnGradGradKernel(
   // ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
  // 0.0f;
   // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
-  auto wkspace_handle = ctx.cudnn_workspace_handle();
+  auto workspace_handle = ctx.cudnn_workspace_handle();
   if (ddO) {
     if (ddX) {
       ddx = transformed_ddX.data<T>();
 #ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(
                 paddle::platform::dynload::miopenConvolutionForward(
@@ -471,7 +471,7 @@ void ConvCudnnGradGradKernel(
           workspace_size);
 #else
       for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
+        workspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_GPU_SUCCESS(
                  paddle::platform::dynload::cudnnConvolutionForward(
@@ -496,7 +496,7 @@ void ConvCudnnGradGradKernel(
     if (ddW) {
 #ifdef PADDLE_WITH_HIP
       // MIOPEN ONLY support beta to be 0.0f
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(
                 paddle::platform::dynload::miopenConvolutionForward(
@@ -517,7 +517,7 @@ void ConvCudnnGradGradKernel(
           workspace_size);
 #else
       for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
+        workspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_GPU_SUCCESS(
                  paddle::platform::dynload::cudnnConvolutionForward(
@@ -547,7 +547,7 @@ void ConvCudnnGradGradKernel(
   if (dW && ddX) {
     ddx = transformed_ddX.data<T>();
 #ifdef PADDLE_WITH_HIP
-    wkspace_handle.RunFunc(
+    workspace_handle.RunFunc(
         [&](void* workspace_ptr) {
           PADDLE_ENFORCE_GPU_SUCCESS(
               paddle::platform::dynload::miopenConvolutionBackwardWeights(
@@ -568,7 +568,7 @@ void ConvCudnnGradGradKernel(
         workspace_size);
 #else
     for (int i = 0; i < groups; i++) {
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
          [&](void* workspace_ptr) {
            PADDLE_ENFORCE_GPU_SUCCESS(
                paddle::platform::dynload::cudnnConvolutionBackwardFilter(
@@ -594,7 +594,7 @@ void ConvCudnnGradGradKernel(
   if (dX && ddW) {
     ddw = ddW->data<T>();
 #ifdef PADDLE_WITH_HIP
-    wkspace_handle.RunFunc(
+    workspace_handle.RunFunc(
         [&](void* workspace_ptr) {
           PADDLE_ENFORCE_GPU_SUCCESS(
               paddle::platform::dynload::miopenConvolutionBackwardData(
@@ -615,7 +615,7 @@ void ConvCudnnGradGradKernel(
         workspace_size);
 #else
     for (int i = 0; i < groups; i++) {
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
          [&](void* workspace_ptr) {
            PADDLE_ENFORCE_GPU_SUCCESS(
                paddle::platform::dynload::cudnnConvolutionBackwardData(
```
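The `wkspace_handle` to `workspace_handle` rename above is cosmetic, but the `RunFunc` pattern it names is worth spelling out: the workspace handle supplies a scratch buffer of the requested size and invokes the supplied callback with its pointer. Below is a simplified stand-in sketching that contract; it is not Paddle's actual `CudnnWorkspaceHandle` implementation.

```cpp
// Sketch of the RunFunc workspace pattern; WorkspaceHandle and Example are
// illustrative stand-ins, not Paddle's real API.
#include <cstddef>
#include <functional>
#include <vector>

class WorkspaceHandle {
 public:
  // Runs `fn` with a scratch buffer of at least `required_bytes`.
  void RunFunc(const std::function<void(void*)>& fn, size_t required_bytes) {
    if (buffer_.size() < required_bytes) {
      buffer_.resize(required_bytes);  // grow-only scratch allocation
    }
    fn(buffer_.empty() ? nullptr : buffer_.data());
  }

 private:
  std::vector<char> buffer_;
};

void Example() {
  WorkspaceHandle workspace_handle;
  size_t workspace_size = 16 << 20;  // e.g. 16 MB from the algo search
  workspace_handle.RunFunc(
      [&](void* workspace_ptr) {
        // A cudnnConvolutionForward-style call would receive workspace_ptr
        // and workspace_size here.
        (void)workspace_ptr;
      },
      workspace_size);
}
```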
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
```diff
@@ -251,12 +251,14 @@ void ConvCudnnGradKernel(const Context& ctx,
   T* input_grad_data = nullptr;
   T* transformed_input_grad_data = nullptr;

+  auto handle = ctx.cudnn_handle();
   paddle::platform::DataLayout layout =
       compute_format == paddle::platform::DataLayout::kNHWC
           ? paddle::platform::DataLayout::kNHWC
           : paddle::platform::DataLayout::kNCHW;

-  ConvArgs args1{&transformed_input_grad,
+  ConvArgs args1{handle,
+                 &transformed_input_grad,
                  &transformed_filter_channel,
                  &transformed_output_grad_channel,
                  strides,
@@ -265,7 +267,8 @@ void ConvCudnnGradKernel(const Context& ctx,
                  dtype,
                  groups,
                  layout};
-  ConvArgs args2{&transformed_input,
+  ConvArgs args2{handle,
+                 &transformed_input,
                  &transformed_filter_grad_channel,
                  &transformed_output_grad_channel,
                  strides,
@@ -275,7 +278,6 @@ void ConvCudnnGradKernel(const Context& ctx,
                  groups,
                  layout};

-  auto handle = ctx.cudnn_handle();
   // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
   if (transformed_input.dims().size() == 5) {
@@ -332,10 +334,7 @@ void ConvCudnnGradKernel(const Context& ctx,
   SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
   SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
 #endif
-  // input data workspace_size
-  size_t workspace_size_d = 0;
-  // weight workspace_size
-  size_t workspace_size_w = 0;
+  size_t workspace_size = 0;
   int iwo_groups = groups;
   int c_groups = 1;
@@ -350,7 +349,6 @@ void ConvCudnnGradKernel(const Context& ctx,
     input_grad_data = input_grad->data<T>();
     transformed_input_grad_data = transformed_input_grad.data<T>();
-    args1.handle = handle;
     args1.idesc.set(transformed_input_grad, layout_tensor);
     args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
     args1.odesc.set(transformed_output_grad_channel, layout_tensor);
@@ -363,21 +361,20 @@ void ConvCudnnGradKernel(const Context& ctx,
 #ifdef PADDLE_WITH_HIP
     using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-    workspace_size_d =
-        std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
+    workspace_size =
+        std::max(workspace_size, search1::GetWorkspaceSize(args1));
     bwd_result.algo = search1::Find<T>(
-        args1, exhaustive_search, deterministic, workspace_size_d, ctx);
+        args1, exhaustive_search, deterministic, workspace_size, ctx);
 #else
     using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
     bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic);
-    workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size);
+    workspace_size = std::max(workspace_size, bwd_result.workspace_size);
 #endif
   }

   if (filter_grad) {
     // ------------------- cudnn descriptors ---------------------
     filter_grad_data = transformed_filter_grad_channel.data<T>();
-    args2.handle = handle;
     args2.idesc.set(transformed_input, layout_tensor);
     args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups);
     args2.odesc.set(transformed_output_grad_channel, layout_tensor);
@@ -389,17 +386,16 @@ void ConvCudnnGradKernel(const Context& ctx,
         c_groups);
 #ifdef PADDLE_WITH_HIP
     using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
-    workspace_size_w =
-        std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
+    workspace_size =
+        std::max(workspace_size, search2::GetWorkspaceSize(args2));
     filter_result.algo = search2::Find<T>(
-        args2, exhaustive_search, deterministic, workspace_size_w, ctx);
+        args2, exhaustive_search, deterministic, workspace_size, ctx);
 #else
     using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
     filter_result =
         search2::Find<T>(ctx, args2, exhaustive_search, deterministic);
     VLOG(3) << "filter algo: " << filter_result.algo << ", time "
             << filter_result.time;
-    workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size);
+    workspace_size = std::max(workspace_size, filter_result.workspace_size);
 #endif
   }
@@ -438,9 +434,9 @@ void ConvCudnnGradKernel(const Context& ctx,
                 args1.idesc.desc(),
                 temp_tensor_data,
                 cudnn_workspace_ptr,
-                workspace_size_d));
+                workspace_size));
           },
-          workspace_size_d);
+          workspace_size);
       PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor(
           handle,
           miopenTensorOpAdd,
@@ -470,9 +466,9 @@ void ConvCudnnGradKernel(const Context& ctx,
                 args1.idesc.desc(),
                 transformed_input_grad_data,
                 cudnn_workspace_ptr,
-                workspace_size_d));
+                workspace_size));
           },
-          workspace_size_d);
+          workspace_size);
     }
 #else
@@ -490,12 +486,12 @@ void ConvCudnnGradKernel(const Context& ctx,
                 args1.cdesc.desc(),
                 bwd_result.algo,
                 cudnn_workspace_ptr,
-                workspace_size_d,
+                workspace_size,
                 &beta,
                 args1.idesc.desc(),
                 transformed_input_grad_data + i * group_offset_in));
           },
-          workspace_size_d);
+          workspace_size);
     }
 #endif
     if (!is_sys_pad) {
@@ -551,9 +547,9 @@ void ConvCudnnGradKernel(const Context& ctx,
                 args2.wdesc.desc(),
                 filter_grad_data,
                 cudnn_workspace_ptr,
-                workspace_size_w));
+                workspace_size));
           },
-          workspace_size_w);
+          workspace_size);
 #else
     for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
@@ -569,12 +565,12 @@ void ConvCudnnGradKernel(const Context& ctx,
                 args2.cdesc.desc(),
                 filter_result.algo,
                 cudnn_workspace_ptr,
-                workspace_size_w,
+                workspace_size,
                 &beta_filter,
                 args2.wdesc.desc(),
                 filter_grad_data + i * group_offset_filter));
           },
-          workspace_size_w);
+          workspace_size);
     }
 #endif
```
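The other substantive change in this file replaces the separate `workspace_size_d` / `workspace_size_w` accumulators with a single `workspace_size`, grown with `std::max` over whatever the backward-data and backward-filter algorithms request. Since the two passes run sequentially, one buffer covering the larger requirement serves both. A small sketch of the sizing logic (the sizes, allocation, and the `run_*` calls are hypothetical stand-ins):

```cpp
// Sketch of why a single max-sized workspace suffices for both backward
// passes; all names and sizes below are illustrative, not Paddle's API.
#include <algorithm>
#include <cstddef>
#include <vector>

int main() {
  size_t bwd_data_ws = 64 << 20;    // e.g. 64 MB needed by the data pass
  size_t bwd_filter_ws = 48 << 20;  // e.g. 48 MB needed by the filter pass

  // One buffer covering the larger requirement is reused sequentially by
  // both passes, instead of tracking workspace_size_d / workspace_size_w.
  size_t workspace_size = 0;
  workspace_size = std::max(workspace_size, bwd_data_ws);
  workspace_size = std::max(workspace_size, bwd_filter_ws);

  std::vector<char> workspace(workspace_size);  // stand-in allocation
  // run_bwd_data(workspace.data(), workspace_size);    (hypothetical)
  // run_bwd_filter(workspace.data(), workspace_size);  (hypothetical)
  return 0;
}
```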
paddle/phi/kernels/gpudnn/conv_kernel.cu
```diff
@@ -201,11 +201,14 @@ void ConvCudnnKernel(const Context& ctx,
   }

   const T* input_data = transformed_input.data<T>();
   const T* filter_data = transformed_filter_channel.data<T>();

+  auto handle = ctx.cudnn_handle();
+  auto workspace_handle = ctx.cudnn_workspace_handle();
   // ------------------- cudnn descriptors ---------------------
-  ConvArgs args{&transformed_input,
+  ConvArgs args{handle,
+                &transformed_input,
                 &transformed_filter_channel,
                 &transformed_output,
                 strides,
@@ -215,8 +218,6 @@ void ConvCudnnKernel(const Context& ctx,
                 groups,
                 compute_format};

-  auto handle = ctx.cudnn_handle();
-  auto workspace_handle = ctx.cudnn_workspace_handle();
   paddle::platform::DataLayout layout =
       compute_format == paddle::platform::DataLayout::kNHWC
           ? paddle::platform::DataLayout::kNHWC
@@ -228,8 +229,6 @@ void ConvCudnnKernel(const Context& ctx,
   }
   auto layout_format = paddle::platform::GetCudnnTensorFormat(layout);

-  args.handle = handle;
-
 #ifdef PADDLE_WITH_HIP
   // MIOPEN need to set groups in cdesc in miopen_desc.h
   args.cdesc.set(dtype,
```
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
```diff
@@ -172,8 +172,10 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
 #endif
   auto dtype = paddle::platform::CudnnDataType<T>::type;

-  ConvArgs args1{&transformed_dout,
+  auto handle = ctx.cudnn_handle();
+  ConvArgs args1{handle,
+                 &transformed_dout,
                  &filter,
                  &x_transpose,
                  strides,
@@ -182,7 +184,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
                  dtype,
                  groups,
                  layout};
-  ConvArgs args2{&transformed_dout,
+  ConvArgs args2{handle,
+                 &transformed_dout,
                  &filter,
                  &x_transpose,
                  strides,
@@ -202,14 +205,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
   auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);

   size_t workspace_size = 0;
-  auto handle = ctx.cudnn_handle();
   bool deterministic = FLAGS_cudnn_deterministic;
   T* dx_data = nullptr;
   T* dfilter_data = nullptr;

   if (dx) {
     dx_data = ctx.template Alloc<T>(dx);
-    args1.handle = handle;
     args1.idesc.set(transformed_dout, iwo_groups);
     args1.wdesc.set(filter, layout_tensor, iwo_groups);
     args1.odesc.set(x_transpose, iwo_groups);
@@ -234,7 +236,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
   if (dfilter) {
     dfilter_data = ctx.template Alloc<T>(dfilter);
-    args2.handle = handle;
     args2.idesc.set(transformed_dout, iwo_groups);
     args2.wdesc.set(*dfilter, layout_tensor, iwo_groups);
     args2.odesc.set(x_transpose, iwo_groups);
@@ -625,7 +627,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   auto handle = ctx.cudnn_handle();
   auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);

-  ConvArgs args1{&transformed_ddout_channel,
+  ConvArgs args1{handle,
+                 &transformed_ddout_channel,
                  &filter,
                  &transformed_ddx,
                  strides,
@@ -634,7 +637,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                  dtype,
                  groups,
                  GPUDNNDataLayout::kNCHW};
-  ConvArgs args2{&transformed_ddout_channel,
+  ConvArgs args2{handle,
+                 &transformed_ddout_channel,
                  &ddfilter,
                  &transformed_x,
                  strides,
@@ -644,7 +648,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                  groups,
                  GPUDNNDataLayout::kNCHW};
-  ConvArgs args3{&transformed_dout,
+  ConvArgs args3{handle,
+                 &transformed_dout,
                  dfilter,
                  &transformed_ddx_channel,
                  strides,
@@ -653,7 +658,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                  dtype,
                  groups,
                  GPUDNNDataLayout::kNCHW};
-  ConvArgs args4{&transformed_dout,
+  ConvArgs args4{handle,
+                 &transformed_dout,
                  &ddfilter,
                  &transformed_dx_channel,
                  strides,
@@ -683,7 +689,6 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
     ddout_ = ddout->data<T>();
     transformed_ddout_channel_ = transformed_ddout_channel.data<T>();
-    args1.handle = handle;
     args1.idesc.set(transformed_ddout_channel, iwo_group);
     args1.wdesc.set(filter, layout, iwo_group);
     args1.odesc.set(transformed_ddx, iwo_group);
@@ -730,7 +735,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   if (dfilter) {
     dfilter_ = dfilter->data<T>();
-    args3.handle = handle;
     args3.idesc.set(transformed_dout, iwo_group);
     args3.wdesc.set(*dfilter, layout, iwo_group);
     args3.odesc.set(transformed_ddx_channel, iwo_group);
@@ -806,13 +811,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   ScalingParamType<T> alpha = 1.0f;
   ScalingParamType<T> beta = 0.0f;

-  auto wkspace_handle = ctx.cudnn_workspace_handle();
+  auto workspace_handle = ctx.cudnn_workspace_handle();
   if (ddout) {
     ddx_ = transformed_ddx.data<T>();
     for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
                 handle,
@@ -831,7 +836,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
           },
           workspace_size);
 #else  // PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
                 handle,
@@ -858,7 +863,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
         DenseTensor conv_x_ddfilter(dout.type());
         conv_x_ddfilter.Resize(transformed_ddout_channel.dims());
         T* conv_x_ddfilter_data = ctx.template Alloc<T>(&conv_x_ddfilter);
-        wkspace_handle.RunFunc(
+        workspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
                  handle,
@@ -889,7 +894,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                 args2.idesc.desc(),
                 transformed_ddout_channel_ + i * group_offset_out));
 #else  // PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
                 handle,
@@ -944,7 +949,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
     ddx_ = transformed_ddx_channel.data<T>();
     for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
          [&](void* workspace_ptr) {
            PADDLE_ENFORCE_GPU_SUCCESS(
                dynload::miopenConvolutionBackwardWeights(
@@ -964,7 +969,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
           },
           workspace_size);
 #else  // PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter(
                 handle,
@@ -990,7 +995,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
     ddfilter_ = ddfilter.data<T>();
     for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward(
                 handle,
@@ -1009,7 +1014,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
           },
           workspace_size);
 #else  // PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward(
                 handle,
```
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
```diff
@@ -199,7 +199,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
   auto dtype = paddle::platform::CudnnDataType<T>::type;

   // ------------------- cudnn descriptors ---------------------
-  ConvArgs args{&transformed_out,
+  ConvArgs args{handle,
+                &transformed_out,
                 &filter,
                 &transformed_x,
                 strides,
@@ -208,7 +209,6 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
                 dtype,
                 groups,
                 data_layout};
-  args.handle = handle;
   args.idesc.set(transformed_out, iwo_groups);
   args.wdesc.set(filter, layout_tensor, iwo_groups);
   args.odesc.set(transformed_x, iwo_groups);
```