Commit bc47e7ac (unverified)
Authored Oct 24, 2022 by Yiqun Liu; committed via GitHub on Oct 24, 2022

Enhance the implementation of some conv functions. (#47281)

Parent: 2f3ad5ab
Showing 7 changed files with 112 additions and 132 deletions (+112, −132).
- paddle/phi/kernels/gpudnn/conv_cudnn_v7.h (+28, −55)
- paddle/phi/kernels/gpudnn/conv_gpudnn_base.h (+12, −5)
- paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu (+17, −17)
- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu (+23, −27)
- paddle/phi/kernels/gpudnn/conv_kernel.cu (+5, −6)
- paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu (+25, −20)
- paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu (+2, −2)
paddle/phi/kernels/gpudnn/conv_cudnn_v7.h

@@ -75,9 +75,9 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
                            SearchResult<AlgoT>* search_result) {
   int best_algo_idx = -1;
   for (size_t i = 0; i < perf_results.size(); ++i) {
-    auto result = perf_results[i];
+    const auto& result = perf_results[i];
     if (result.status == CUDNN_STATUS_SUCCESS &&
-        result.memory < workspace_limit) {
+        result.memory <= workspace_limit) {
       if (best_algo_idx == -1) {
         // The algorithm which has minimize time cost and need a workspace_size
         // fitting the workspace_limit constraint.
@@ -87,8 +87,10 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
           break;
         }
       } else {
-        float best_algo_time = perf_results[best_algo_idx].time;
-        if ((result.time - best_algo_time) / best_algo_time < 0.01) {
+        // Compared to the next suboptimal algorithm, if the best one only has
+        // 1% performance difference, we'd like to pick the one which need less
+        // memory.
+        if (result.time < 1.01 * perf_results[best_algo_idx].time) {
           best_algo_idx = (result.memory < perf_results[best_algo_idx].memory)
                               ? i
                               : best_algo_idx;
@@ -98,9 +100,15 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
     }
   }
   if (best_algo_idx != -1) {
-    search_result->algo = perf_results[best_algo_idx].algo;
-    search_result->time = perf_results[best_algo_idx].time;
-    search_result->workspace_size = perf_results[best_algo_idx].memory;
+    const auto& result = perf_results[best_algo_idx];
+    search_result->algo = result.algo;
+    search_result->time = result.time;
+    search_result->workspace_size = result.memory;
+    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
+    VLOG(3) << "Choose algo=" << result.algo
+            << ", tensor_core=" << math_type_str << ", time=" << result.time
+            << " ms, memory=" << ToMegaBytes(result.memory)
+            << " MB, status=" << result.status;
   } else {
     VLOG(3) << "Can not find an algorithm that requires memory < "
             << ToMegaBytes(workspace_limit) << " MB";
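Read together, the three hunks above define a compact selection rule: scan the perf results (which cuDNN's Find* APIs return sorted by ascending time), take the first one whose workspace fits the limit, then keep trading it for a later candidate only when that candidate is within 1% of the best time and needs less memory. A minimal self-contained sketch of that rule, using a simplified stand-in for the cuDNN perf-result struct (all names here are illustrative, not Paddle's):

```cpp
#include <cstddef>
#include <vector>

// Simplified stand-in for cudnnConvolution*AlgoPerf_t (illustrative only).
struct PerfResult {
  int algo;
  float time;     // measured runtime in ms
  size_t memory;  // required workspace in bytes
  bool ok;        // stands in for status == CUDNN_STATUS_SUCCESS
};

// Mirrors the rule in ChooseAlgoByWorkspace: results are assumed sorted by
// ascending time, so the first fitting result is the fastest. A later result
// replaces it only if it is within 1% of that time and needs less memory.
int ChooseAlgoUnderLimit(const std::vector<PerfResult>& results, size_t limit) {
  int best = -1;
  for (size_t i = 0; i < results.size(); ++i) {
    const auto& r = results[i];
    if (!r.ok || r.memory > limit) continue;
    if (best == -1) {
      best = static_cast<int>(i);  // fastest algorithm that fits
    } else if (r.time < 1.01f * results[best].time) {
      if (r.memory < results[best].memory) {
        best = static_cast<int>(i);  // nearly as fast, cheaper workspace
      }
    } else {
      break;  // everything after this point is more than 1% slower
    }
  }
  return best;  // -1 if no algorithm fits the workspace limit
}
```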
@@ -626,7 +634,8 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
                                        perf_results,
                                        perf_results.size(),
                                        workspace_size_limit);
-      ChooseAlgo(perf_results, workspace_size_limit, &result);
+      ChooseAlgoByWorkspace<PerfT, AlgoT>(
+          perf_results, workspace_size_limit, &result);
     }

     result.workspace_size = GetWorkspaceSize(args, result.algo);

@@ -673,42 +682,6 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
       return workspace_size_limit;
     }
   }
-
-  static void ChooseAlgo(const std::vector<PerfT>& perf_results,
-                         size_t workspace_limit,
-                         SearchResult<AlgoT>* algo_result) {
-    for (size_t i = 0; i != perf_results.size(); ++i) {
-      const auto& result = perf_results[i];
-      if (result.status == CUDNN_STATUS_SUCCESS &&
-          (result.memory <= workspace_limit)) {
-        if ((result.mathType == CUDNN_TENSOR_OP_MATH) &&
-            (i != perf_results.size() - 1)) {
-          const auto& next_result = perf_results[i + 1];
-          if (next_result.status == CUDNN_STATUS_SUCCESS &&
-              next_result.algo == result.algo &&
-              next_result.memory == result.memory &&
-              next_result.mathType != CUDNN_TENSOR_OP_MATH &&
-              next_result.time < 1.01 * result.time) {
-            // Skip over this result- it's not really a Tensor Core algo.
-            // Because it is only 1% performance difference.
-            // Prefer to choose the next equivalent non-Tensor Core algo.
-            continue;
-          }
-        }
-        algo_result->algo = result.algo;
-        algo_result->time = result.time;
-        auto math_type_str = "0";
-        if (result.mathType == CUDNN_TENSOR_OP_MATH) {
-          math_type_str = "1";
-        }
-        VLOG(3) << "    choose algo: " << result.algo
-                << ", TC: " << math_type_str << ", time: " << result.time
-                << " ms, wksp = " << result.memory
-                << ", status = " << result.status;
-        break;
-      }
-    }
-  }
 };

 template <typename PerfT>
@@ -735,7 +708,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
     // Auto tune is only enabled between specified range.
     // 3. After auto-tune process, run cached algorithm if cached, run
     // default mode for the rest.
-    auto key = args.Convert2ConvCacheKey<T>();
+    auto key = args.ConvertToConvCacheKey<T>();
     auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
         SearchAlgorithmBase<PerfT>::kAlgoType);
     bool find_in_cache = cache.Find(key);

@@ -746,7 +719,6 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
       result.exhaustive_search = t.exhaustive_search;
     }
     if (!result.exhaustive_search) {
-      bool need_update_cache = false;
       // In conv2d_tranpose, enable_autotune is set to false because some
       // algorithm picked by exhaustive search method produce wrong result.
       use_autotune = enable_autotune &&
@@ -757,17 +729,18 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
         result = SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
             args, ctx);
-        need_update_cache = true;
+        cache.Set(key,
+                  phi::autotune::ConvAutoTuneResult(
+                      static_cast<int64_t>(result.algo),
+                      result.workspace_size,
+                      true));
       } else if (!find_in_cache) {
         result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
-        need_update_cache = true;
-      }
-      if (need_update_cache) {
-        phi::autotune::ConvAutoTuneResult node(
-            static_cast<int64_t>(result.algo),
-            result.workspace_size,
-            exhaustive_search || use_autotune);
-        cache.Set(key, node);
+        cache.Set(key,
+                  phi::autotune::ConvAutoTuneResult(
+                      static_cast<int64_t>(result.algo),
+                      result.workspace_size,
+                      false));
       }
     }
   }
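The hunk above inlines the cache update at each search site: compute a key from the conv arguments, look it up, and only on a miss run the expensive exhaustive or heuristic search and store the result. A minimal sketch of that read-through caching shape, with hypothetical key and value types standing in for phi::autotune's:

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>

// Hypothetical stand-ins for phi::autotune's cache key and cached value.
using ConvCacheKey = std::size_t;  // in Paddle, a hash of shapes/dtype/etc.
struct CachedAlgo {
  int64_t algo;
  size_t workspace_size;
  bool exhaustive;  // whether the entry came from exhaustive search
};

class ConvAlgoCache {
 public:
  // Read-through lookup: return the cached algorithm, or run `search`
  // once, store its result, and return it.
  CachedAlgo FindOrSearch(ConvCacheKey key,
                          const std::function<CachedAlgo()>& search) {
    auto it = cache_.find(key);
    if (it != cache_.end()) {
      return it->second;  // cache hit: skip the expensive search
    }
    CachedAlgo result = search();  // miss: exhaustive/heuristic search
    cache_.emplace(key, result);
    return result;
  }

 private:
  std::unordered_map<ConvCacheKey, CachedAlgo> cache_;
};
```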
paddle/phi/kernels/gpudnn/conv_gpudnn_base.h

@@ -69,10 +69,15 @@ static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
 template <typename HandleT, typename DataT>
 struct ConvArgsBase {
   HandleT handle;
-  paddle::platform::TensorDescriptor idesc, odesc;
+  paddle::platform::TensorDescriptor idesc;
+  paddle::platform::TensorDescriptor odesc;
   paddle::platform::FilterDescriptor wdesc;
   paddle::platform::ConvolutionDescriptor cdesc;
-  const phi::DenseTensor *x, *w, *o;
+  const phi::DenseTensor* x = nullptr;
+  const phi::DenseTensor* w = nullptr;
+  const phi::DenseTensor* o = nullptr;
   DataT cudnn_dtype;

   // strides
@@ -88,7 +93,8 @@ struct ConvArgsBase {
   // data foramt
   GPUDNNDataLayout data_layout;

-  ConvArgsBase(const phi::DenseTensor* x,
+  ConvArgsBase(const HandleT& h,
+               const phi::DenseTensor* x,
                const phi::DenseTensor* w,
                const phi::DenseTensor* o,
                const std::vector<int> s,

@@ -97,7 +103,8 @@ struct ConvArgsBase {
                DataT dtype,
                int g,
                GPUDNNDataLayout layout)
-      : x(x),
+      : handle(h),
+        x(x),
         w(w),
         o(o),
         s(s),
@@ -108,7 +115,7 @@ struct ConvArgsBase {
         data_layout(layout) {}

   template <typename T>
-  phi::autotune::ConvCacheKey Convert2ConvCacheKey() const {
+  phi::autotune::ConvCacheKey ConvertToConvCacheKey() const {
     auto x_shape = phi::vectorize(x->dims());
     auto w_shape = phi::vectorize(w->dims());
     VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
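The recurring change across the kernel files below follows from this header: the device handle becomes a constructor parameter of ConvArgsBase instead of a member patched in after construction via `args.handle = handle;`, so an args object is never observable in a half-initialized state. A small sketch of the before/after pattern, with simplified stand-in types (Handle, Tensor, and both Args structs are illustrative, not Paddle's):

```cpp
// Simplified illustration of the refactor; Handle and Tensor are stand-ins.
struct Handle {};
struct Tensor {};

// Before: the handle is assigned after construction. Every call site has
// to remember the extra statement, and a forgotten one still compiles.
struct ArgsBefore {
  Handle* handle = nullptr;
  const Tensor* x;
  explicit ArgsBefore(const Tensor* x) : x(x) {}
};

// After: the handle is required at construction, so the object is fully
// initialized in one step and the omission cannot be expressed.
struct ArgsAfter {
  Handle* handle;
  const Tensor* x;
  ArgsAfter(Handle* h, const Tensor* x) : handle(h), x(x) {}
};

void Demo(Handle* h, const Tensor* t) {
  ArgsBefore a{t};
  a.handle = h;       // easy to forget
  ArgsAfter b{h, t};  // handle supplied up front
  (void)a;
  (void)b;
}
```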
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu

@@ -257,7 +257,8 @@ void ConvCudnnGradGradKernel(
   auto layout = paddle::platform::GetCudnnTensorFormat(
       paddle::platform::DataLayout::kNCHW);

-  ConvArgs args1{&transformed_ddX,
+  ConvArgs args1{handle,
+                 &transformed_ddX,
                  W,
                  &transformed_ddO_channel,
                  strides,
@@ -266,7 +267,8 @@ void ConvCudnnGradGradKernel(
                  dtype,
                  groups,
                  paddle::platform::DataLayout::kNCHW};
-  ConvArgs args2{&transformed_X,
+  ConvArgs args2{handle,
+                 &transformed_X,
                  ddW,
                  &transformed_ddO_channel,
                  strides,

@@ -275,7 +277,8 @@ void ConvCudnnGradGradKernel(
                  dtype,
                  groups,
                  paddle::platform::DataLayout::kNCHW};
-  ConvArgs args3{&transformed_ddX,
+  ConvArgs args3{handle,
+                 &transformed_ddX,
                  dW,
                  &transformed_dO_channel,
                  strides,

@@ -284,7 +287,8 @@ void ConvCudnnGradGradKernel(
                  dtype,
                  groups,
                  paddle::platform::DataLayout::kNCHW};
-  ConvArgs args4{&transformed_dX,
+  ConvArgs args4{handle,
+                 &transformed_dX,
                  ddW,
                  &transformed_dO_channel,
                  strides,
@@ -314,7 +318,6 @@ void ConvCudnnGradGradKernel(
     ddy = ddO->data<T>();
     transformed_ddy_channel = transformed_ddO_channel.data<T>();
     if (ddX) {
-      args1.handle = handle;
       args1.idesc.set(transformed_ddX, iwo_group);
       args1.wdesc.set(*W, layout, iwo_group);
       args1.odesc.set(transformed_ddO_channel, iwo_group);

@@ -339,7 +342,6 @@ void ConvCudnnGradGradKernel(
     if (ddW) {
       ddw = ddW->data<T>();
-      args2.handle = handle;
       args2.idesc.set(transformed_X, iwo_group);
       args2.wdesc.set(*ddW, layout, iwo_group);
       args2.odesc.set(transformed_ddO_channel, iwo_group);

@@ -367,7 +369,6 @@ void ConvCudnnGradGradKernel(
     if (dW && ddX) {
       dw = dW->data<T>();
-      args3.handle = handle;
       args3.idesc.set(transformed_ddX, iwo_group);
       args3.wdesc.set(*dW, layout, iwo_group);
       args3.odesc.set(transformed_dO_channel, iwo_group);

@@ -395,7 +396,6 @@ void ConvCudnnGradGradKernel(
     if (ddW && dX) {
       transformed_dx = transformed_dX.data<T>();
-      args4.handle = handle;
       args4.idesc.set(transformed_dX, iwo_group);
       args4.wdesc.set(*ddW, layout, iwo_group);
       args4.odesc.set(transformed_dO_channel, iwo_group);
@@ -444,13 +444,13 @@ void ConvCudnnGradGradKernel(
   // ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
   // 0.0f;
   // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
-  auto wkspace_handle = ctx.cudnn_workspace_handle();
+  auto workspace_handle = ctx.cudnn_workspace_handle();
   if (ddO) {
     if (ddX) {
       ddx = transformed_ddX.data<T>();
 #ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(
                 paddle::platform::dynload::miopenConvolutionForward(
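The `workspace_handle.RunFunc(...)` calls throughout these kernels share one shape: the handle provides a scratch buffer of the requested size, invokes the lambda with its pointer, and keeps ownership of the buffer, so call sites never manage workspace lifetime by hand. A rough sketch of that pattern, assuming a plain std::vector as the backing allocation (the real handle draws from a GPU allocator, and this class is illustrative only):

```cpp
#include <cstddef>
#include <functional>
#include <vector>

// Rough sketch of a workspace handle: hands a scratch buffer of at least
// `size` bytes to a callback, keeping allocation details out of call sites.
class WorkspaceHandle {
 public:
  void RunFunc(const std::function<void(void*)>& func, size_t size) {
    if (buffer_.size() < size) {
      buffer_.resize(size);  // grow-only reuse across calls
    }
    func(buffer_.empty() ? nullptr : buffer_.data());
  }

 private:
  std::vector<unsigned char> buffer_;
};

// Usage mirrors the kernels above: the lambda receives workspace_ptr and
// would pass it to the convolution call along with workspace_size.
void Example(WorkspaceHandle& workspace_handle, size_t workspace_size) {
  workspace_handle.RunFunc(
      [&](void* workspace_ptr) {
        // ... launch the conv with workspace_ptr / workspace_size ...
        (void)workspace_ptr;
      },
      workspace_size);
}
```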
@@ -471,7 +471,7 @@ void ConvCudnnGradGradKernel(
                     workspace_size);
 #else
       for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
+        workspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_GPU_SUCCESS(
                  paddle::platform::dynload::cudnnConvolutionForward(

@@ -496,7 +496,7 @@ void ConvCudnnGradGradKernel(
     if (ddW) {
 #ifdef PADDLE_WITH_HIP
       // MIOPEN ONLY support beta to be 0.0f
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(
                 paddle::platform::dynload::miopenConvolutionForward(

@@ -517,7 +517,7 @@ void ConvCudnnGradGradKernel(
                     workspace_size);
 #else
       for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
+        workspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_GPU_SUCCESS(
                  paddle::platform::dynload::cudnnConvolutionForward(

@@ -547,7 +547,7 @@ void ConvCudnnGradGradKernel(
     if (dW && ddX) {
       ddx = transformed_ddX.data<T>();
 #ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(
                 paddle::platform::dynload::miopenConvolutionBackwardWeights(

@@ -568,7 +568,7 @@ void ConvCudnnGradGradKernel(
                     workspace_size);
 #else
       for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
+        workspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_GPU_SUCCESS(
                  paddle::platform::dynload::cudnnConvolutionBackwardFilter(

@@ -594,7 +594,7 @@ void ConvCudnnGradGradKernel(
     if (dX && ddW) {
       ddw = ddW->data<T>();
 #ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(
                 paddle::platform::dynload::miopenConvolutionBackwardData(

@@ -615,7 +615,7 @@ void ConvCudnnGradGradKernel(
                     workspace_size);
 #else
       for (int i = 0; i < groups; i++) {
-        wkspace_handle.RunFunc(
+        workspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_GPU_SUCCESS(
                  paddle::platform::dynload::cudnnConvolutionBackwardData(
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu

@@ -251,12 +251,14 @@ void ConvCudnnGradKernel(const Context& ctx,
   T* input_grad_data = nullptr;
   T* transformed_input_grad_data = nullptr;

+  auto handle = ctx.cudnn_handle();
   paddle::platform::DataLayout layout =
       compute_format == paddle::platform::DataLayout::kNHWC
           ? paddle::platform::DataLayout::kNHWC
           : paddle::platform::DataLayout::kNCHW;
-  ConvArgs args1{&transformed_input_grad,
+  ConvArgs args1{handle,
+                 &transformed_input_grad,
                  &transformed_filter_channel,
                  &transformed_output_grad_channel,
                  strides,
@@ -265,7 +267,8 @@ void ConvCudnnGradKernel(const Context& ctx,
                  dtype,
                  groups,
                  layout};
-  ConvArgs args2{&transformed_input,
+  ConvArgs args2{handle,
+                 &transformed_input,
                  &transformed_filter_grad_channel,
                  &transformed_output_grad_channel,
                  strides,

@@ -275,7 +278,6 @@ void ConvCudnnGradKernel(const Context& ctx,
                  groups,
                  layout};

-  auto handle = ctx.cudnn_handle();
   // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
   if (transformed_input.dims().size() == 5) {
@@ -332,10 +334,7 @@ void ConvCudnnGradKernel(const Context& ctx,
   SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
   SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
 #endif
-  // input data workspace_size
-  size_t workspace_size_d = 0;
-  // weight workspace_size
-  size_t workspace_size_w = 0;
+  size_t workspace_size = 0;
   int iwo_groups = groups;
   int c_groups = 1;
@@ -350,7 +349,6 @@ void ConvCudnnGradKernel(const Context& ctx,
     input_grad_data = input_grad->data<T>();
     transformed_input_grad_data = transformed_input_grad.data<T>();
-    args1.handle = handle;
     args1.idesc.set(transformed_input_grad, layout_tensor);
     args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
     args1.odesc.set(transformed_output_grad_channel, layout_tensor);

@@ -363,21 +361,20 @@ void ConvCudnnGradKernel(const Context& ctx,
 #ifdef PADDLE_WITH_HIP
     using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-    workspace_size_d =
-        std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
+    workspace_size =
+        std::max(workspace_size, search1::GetWorkspaceSize(args1));
     bwd_result.algo = search1::Find<T>(
-        args1, exhaustive_search, deterministic, workspace_size_d, ctx);
+        args1, exhaustive_search, deterministic, workspace_size, ctx);
 #else
     using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
     bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic);
-    workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size);
+    workspace_size = std::max(workspace_size, bwd_result.workspace_size);
 #endif
   }

   if (filter_grad) {
     // ------------------- cudnn descriptors ---------------------
     filter_grad_data = transformed_filter_grad_channel.data<T>();
-    args2.handle = handle;
     args2.idesc.set(transformed_input, layout_tensor);
     args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups);
     args2.odesc.set(transformed_output_grad_channel, layout_tensor);
@@ -389,17 +386,16 @@ void ConvCudnnGradKernel(const Context& ctx,
                    c_groups);
 #ifdef PADDLE_WITH_HIP
     using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
-    workspace_size_w =
-        std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
+    workspace_size =
+        std::max(workspace_size, search2::GetWorkspaceSize(args2));
     filter_result.algo = search2::Find<T>(
-        args2, exhaustive_search, deterministic, workspace_size_w, ctx);
+        args2, exhaustive_search, deterministic, workspace_size, ctx);
 #else
     using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
     filter_result =
         search2::Find<T>(ctx, args2, exhaustive_search, deterministic);
     VLOG(3) << "filter algo: " << filter_result.algo << ", time "
             << filter_result.time;
-    workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size);
+    workspace_size = std::max(workspace_size, filter_result.workspace_size);
 #endif
   }
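Replacing the separate workspace_size_d / workspace_size_w counters with a single workspace_size folded over std::max means the backward-data and backward-filter paths can share one scratch buffer sized for the larger demand, instead of tracking two. A sketch of the sizing arithmetic (the byte counts are illustrative):

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>

int main() {
  // Workspace demanded by each backward pass (illustrative numbers).
  size_t bwd_data_bytes = 8u << 20;     // 8 MB
  size_t bwd_filter_bytes = 13u << 20;  // 13 MB

  // One shared size obtained by folding std::max over all consumers,
  // as conv_grad_kernel.cu now does with its single workspace_size.
  size_t workspace_size = 0;
  workspace_size = std::max(workspace_size, bwd_data_bytes);
  workspace_size = std::max(workspace_size, bwd_filter_bytes);

  // 13 MB covers both passes, which run one after the other on the same
  // stream, rather than 8 MB + 13 MB in two separately tracked buffers.
  std::cout << "shared workspace: " << (workspace_size >> 20) << " MB\n";
  return 0;
}
```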
@@ -438,9 +434,9 @@ void ConvCudnnGradKernel(const Context& ctx,
                     args1.idesc.desc(),
                     temp_tensor_data,
                     cudnn_workspace_ptr,
-                    workspace_size_d));
+                    workspace_size));
           },
-          workspace_size_d);
+          workspace_size);
       PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor(
           handle,
           miopenTensorOpAdd,

@@ -470,9 +466,9 @@ void ConvCudnnGradKernel(const Context& ctx,
                     args1.idesc.desc(),
                     transformed_input_grad_data,
                     cudnn_workspace_ptr,
-                    workspace_size_d));
+                    workspace_size));
           },
-          workspace_size_d);
+          workspace_size);
     }
 #else

@@ -490,12 +486,12 @@ void ConvCudnnGradKernel(const Context& ctx,
                     args1.cdesc.desc(),
                     bwd_result.algo,
                     cudnn_workspace_ptr,
-                    workspace_size_d,
+                    workspace_size,
                     &beta,
                     args1.idesc.desc(),
                     transformed_input_grad_data + i * group_offset_in));
           },
-          workspace_size_d);
+          workspace_size);
     }
 #endif
     if (!is_sys_pad) {

@@ -551,9 +547,9 @@ void ConvCudnnGradKernel(const Context& ctx,
                     args2.wdesc.desc(),
                     filter_grad_data,
                     cudnn_workspace_ptr,
-                    workspace_size_w));
+                    workspace_size));
           },
-          workspace_size_w);
+          workspace_size);
 #else
       for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(

@@ -569,12 +565,12 @@ void ConvCudnnGradKernel(const Context& ctx,
                     args2.cdesc.desc(),
                     filter_result.algo,
                     cudnn_workspace_ptr,
-                    workspace_size_w,
+                    workspace_size,
                     &beta_filter,
                     args2.wdesc.desc(),
                     filter_grad_data + i * group_offset_filter));
           },
-          workspace_size_w);
+          workspace_size);
     }
 #endif
paddle/phi/kernels/gpudnn/conv_kernel.cu

@@ -201,11 +201,14 @@ void ConvCudnnKernel(const Context& ctx,
   }

   const T* input_data = transformed_input.data<T>();
   const T* filter_data = transformed_filter_channel.data<T>();

+  auto handle = ctx.cudnn_handle();
+  auto workspace_handle = ctx.cudnn_workspace_handle();
   // ------------------- cudnn descriptors ---------------------
-  ConvArgs args{&transformed_input,
+  ConvArgs args{handle,
+                &transformed_input,
                 &transformed_filter_channel,
                 &transformed_output,
                 strides,
@@ -215,8 +218,6 @@ void ConvCudnnKernel(const Context& ctx,
                 groups,
                 compute_format};
-  auto handle = ctx.cudnn_handle();
-  auto workspace_handle = ctx.cudnn_workspace_handle();
   paddle::platform::DataLayout layout =
       compute_format == paddle::platform::DataLayout::kNHWC
           ? paddle::platform::DataLayout::kNHWC

@@ -228,8 +229,6 @@ void ConvCudnnKernel(const Context& ctx,
   }

   auto layout_format = paddle::platform::GetCudnnTensorFormat(layout);
-  args.handle = handle;
-
 #ifdef PADDLE_WITH_HIP
   // MIOPEN need to set groups in cdesc in miopen_desc.h
   args.cdesc.set(dtype,
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu

@@ -172,8 +172,10 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
 #endif
   auto dtype = paddle::platform::CudnnDataType<T>::type;
+  auto handle = ctx.cudnn_handle();

-  ConvArgs args1{&transformed_dout,
+  ConvArgs args1{handle,
+                 &transformed_dout,
                  &filter,
                  &x_transpose,
                  strides,
@@ -182,7 +184,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
                  dtype,
                  groups,
                  layout};
-  ConvArgs args2{&transformed_dout,
+  ConvArgs args2{handle,
+                 &transformed_dout,
                  &filter,
                  &x_transpose,
                  strides,

@@ -202,14 +205,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
   auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);

   size_t workspace_size = 0;
-  auto handle = ctx.cudnn_handle();
   bool deterministic = FLAGS_cudnn_deterministic;
   T* dx_data = nullptr;
   T* dfilter_data = nullptr;

   if (dx) {
     dx_data = ctx.template Alloc<T>(dx);
-    args1.handle = handle;
     args1.idesc.set(transformed_dout, iwo_groups);
     args1.wdesc.set(filter, layout_tensor, iwo_groups);
     args1.odesc.set(x_transpose, iwo_groups);

@@ -234,7 +236,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
   if (dfilter) {
     dfilter_data = ctx.template Alloc<T>(dfilter);
-    args2.handle = handle;
     args2.idesc.set(transformed_dout, iwo_groups);
     args2.wdesc.set(*dfilter, layout_tensor, iwo_groups);
     args2.odesc.set(x_transpose, iwo_groups);
@@ -625,7 +627,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   auto handle = ctx.cudnn_handle();
   auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);

-  ConvArgs args1{&transformed_ddout_channel,
+  ConvArgs args1{handle,
+                 &transformed_ddout_channel,
                  &filter,
                  &transformed_ddx,
                  strides,

@@ -634,7 +637,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                  dtype,
                  groups,
                  GPUDNNDataLayout::kNCHW};
-  ConvArgs args2{&transformed_ddout_channel,
+  ConvArgs args2{handle,
+                 &transformed_ddout_channel,
                  &ddfilter,
                  &transformed_x,
                  strides,

@@ -644,7 +648,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                  groups,
                  GPUDNNDataLayout::kNCHW};

-  ConvArgs args3{&transformed_dout,
+  ConvArgs args3{handle,
+                 &transformed_dout,
                  dfilter,
                  &transformed_ddx_channel,
                  strides,

@@ -653,7 +658,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                  dtype,
                  groups,
                  GPUDNNDataLayout::kNCHW};
-  ConvArgs args4{&transformed_dout,
+  ConvArgs args4{handle,
+                 &transformed_dout,
                  &ddfilter,
                  &transformed_dx_channel,
                  strides,

@@ -683,7 +689,6 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
     ddout_ = ddout->data<T>();
     transformed_ddout_channel_ = transformed_ddout_channel.data<T>();

-    args1.handle = handle;
     args1.idesc.set(transformed_ddout_channel, iwo_group);
     args1.wdesc.set(filter, layout, iwo_group);
     args1.odesc.set(transformed_ddx, iwo_group);
@@ -730,7 +735,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   if (dfilter) {
     dfilter_ = dfilter->data<T>();
-    args3.handle = handle;
     args3.idesc.set(transformed_dout, iwo_group);
     args3.wdesc.set(*dfilter, layout, iwo_group);
     args3.odesc.set(transformed_ddx_channel, iwo_group);
@@ -806,13 +811,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   ScalingParamType<T> alpha = 1.0f;
   ScalingParamType<T> beta = 0.0f;

-  auto wkspace_handle = ctx.cudnn_workspace_handle();
+  auto workspace_handle = ctx.cudnn_workspace_handle();
   if (ddout) {
     ddx_ = transformed_ddx.data<T>();
     for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
                 handle,

@@ -831,7 +836,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
           },
           workspace_size);
 #else  // PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
                 handle,

@@ -858,7 +863,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
         DenseTensor conv_x_ddfilter(dout.type());
         conv_x_ddfilter.Resize(transformed_ddout_channel.dims());
         T* conv_x_ddfilter_data = ctx.template Alloc<T>(&conv_x_ddfilter);
-        wkspace_handle.RunFunc(
+        workspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
                  handle,

@@ -889,7 +894,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                 args2.idesc.desc(),
                 transformed_ddout_channel_ + i * group_offset_out));
 #else  // PADDLE_WITH_HIP
-        wkspace_handle.RunFunc(
+        workspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
                  handle,

@@ -944,7 +949,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
     ddx_ = transformed_ddx_channel.data<T>();
     for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
          [&](void* workspace_ptr) {
            PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights(

@@ -964,7 +969,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
           },
           workspace_size);
 #else  // PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
          [&](void* workspace_ptr) {
            PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter(
                handle,

@@ -990,7 +995,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
     ddfilter_ = ddfilter.data<T>();
     for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
          [&](void* workspace_ptr) {
            PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward(
                handle,

@@ -1009,7 +1014,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
           },
           workspace_size);
 #else  // PADDLE_WITH_HIP
-      wkspace_handle.RunFunc(
+      workspace_handle.RunFunc(
          [&](void* workspace_ptr) {
            PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward(
                handle,
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu

@@ -199,7 +199,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
   auto dtype = paddle::platform::CudnnDataType<T>::type;

   // ------------------- cudnn descriptors ---------------------
-  ConvArgs args{&transformed_out,
+  ConvArgs args{handle,
+                &transformed_out,
                 &filter,
                 &transformed_x,
                 strides,
@@ -208,7 +209,6 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
                 dtype,
                 groups,
                 data_layout};
-  args.handle = handle;
   args.idesc.set(transformed_out, iwo_groups);
   args.wdesc.set(filter, layout_tensor, iwo_groups);
   args.odesc.set(transformed_x, iwo_groups);