Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
31f57f29
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
31f57f29
编写于
10月 24, 2022
作者:
Y
Yiqun Liu
提交者:
GitHub
10月 24, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Move the header file of conv cudnn and miopen to phi directory. (#47248)
上级
a5f556f0
变更
14
隐藏空白更改
内联
并排
Showing
14 changed file
with
415 addition
and
502 deletion
+415
-502
paddle/fluid/framework/var_type_traits.cc
paddle/fluid/framework/var_type_traits.cc
+1
-2
paddle/fluid/framework/var_type_traits_test.cc
paddle/fluid/framework/var_type_traits_test.cc
+0
-2
paddle/fluid/operators/fused/conv_fusion_op.cu
paddle/fluid/operators/fused/conv_fusion_op.cu
+6
-6
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+3
-3
paddle/phi/kernels/gpudnn/conv_cudnn_v7.h
paddle/phi/kernels/gpudnn/conv_cudnn_v7.h
+51
-90
paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
+51
-20
paddle/phi/kernels/gpudnn/conv_gpudnn_info.h
paddle/phi/kernels/gpudnn/conv_gpudnn_info.h
+3
-5
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
+58
-68
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+70
-78
paddle/phi/kernels/gpudnn/conv_kernel.cu
paddle/phi/kernels/gpudnn/conv_kernel.cu
+45
-46
paddle/phi/kernels/gpudnn/conv_miopen_helper.h
paddle/phi/kernels/gpudnn/conv_miopen_helper.h
+9
-46
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
+99
-115
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
+17
-19
paddle/phi/kernels/impl/conv_cudnn_impl.h
paddle/phi/kernels/impl/conv_cudnn_impl.h
+2
-2
未找到文件。
paddle/fluid/framework/var_type_traits.cc
浏览文件 @
31f57f29
...
...
@@ -26,15 +26,14 @@
#endif
#include <cudnn.h>
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
#endif
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
...
...
paddle/fluid/framework/var_type_traits_test.cc
浏览文件 @
31f57f29
...
...
@@ -26,7 +26,6 @@
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
#ifdef PADDLE_WITH_HIP
...
...
@@ -34,7 +33,6 @@
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
...
...
paddle/fluid/operators/fused/conv_fusion_op.cu
浏览文件 @
31f57f29
...
...
@@ -16,10 +16,10 @@ limitations under the License. */
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
DECLARE_int64
(
cudnn_exhaustive_search_times
);
...
...
@@ -216,7 +216,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
phi
::
kNUM_CUDNN_FWD_ALGS
,
&
find_count
,
&
find_result
,
cudnn_workspace_ptr
,
...
...
@@ -337,7 +337,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
int
best_algo_idx
=
0
;
size_t
tmp_size
=
0
;
std
::
unique_ptr
<
cudnnConvolutionFwdAlgoPerf_t
[]
>
perf_results
(
new
cudnnConvolutionFwdAlgoPerf_t
[
kNUM_CUDNN_FWD_ALGS
]);
new
cudnnConvolutionFwdAlgoPerf_t
[
phi
::
kNUM_CUDNN_FWD_ALGS
]);
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
handle
,
...
...
@@ -345,7 +345,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
kNUM_CUDNN_FWD_ALGS
,
phi
::
kNUM_CUDNN_FWD_ALGS
,
&
perf_count
,
perf_results
.
get
()));
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
...
...
@@ -378,7 +378,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
[
&
]()
->
SearchFuseResult
<
cudnnConvolutionFwdAlgo_t
>
{
int
returned_algo_count
;
SearchFuseResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
phi
::
kNUM_CUDNN_FWD_ALGS
>
fwd_perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
...
...
@@ -391,7 +391,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
phi
::
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
fwd_perf_stat
.
data
(),
cudnn_workspace
,
...
...
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
浏览文件 @
31f57f29
...
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -206,7 +206,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
int
best_algo_idx
=
0
;
size_t
tmp_size
=
0
;
std
::
unique_ptr
<
cudnnConvolutionFwdAlgoPerf_t
[]
>
perf_results
(
new
cudnnConvolutionFwdAlgoPerf_t
[
kNUM_CUDNN_FWD_ALGS
]);
new
cudnnConvolutionFwdAlgoPerf_t
[
phi
::
kNUM_CUDNN_FWD_ALGS
]);
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
handle
,
...
...
@@ -214,7 +214,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
filter_desc
[
i
],
conv_desc
[
i
],
out_desc
[
i
],
kNUM_CUDNN_FWD_ALGS
,
phi
::
kNUM_CUDNN_FWD_ALGS
,
&
perf_count
,
perf_results
.
get
()));
algo
[
i
]
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
...
...
paddle/
fluid/operators/conv_cudnn_helper
.h
→
paddle/
phi/kernels/gpudnn/conv_cudnn_v7
.h
浏览文件 @
31f57f29
...
...
@@ -14,52 +14,15 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/conv_base_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
namespace
paddle
{
namespace
operators
{
namespace
phi
{
using
ConvArgs
=
ConvArgsBase
<
cudnnHandle_t
,
cudnnDataType_t
>
;
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
static
void
RemovePaddingSlice
(
const
phi
::
GPUContext
&
context
,
const
phi
::
DenseTensor
*
input
,
phi
::
DenseTensor
*
out
,
const
std
::
vector
<
int
>&
starts
,
const
std
::
vector
<
int
>&
axes
)
{
auto
&
place
=
*
context
.
eigen_device
();
auto
in_dims
=
input
->
dims
();
auto
new_out_dims
=
out
->
dims
();
auto
offsets
=
Eigen
::
DSizes
<
Eigen
::
DenseIndex
,
D
>
();
auto
extents
=
Eigen
::
DSizes
<
Eigen
::
DenseIndex
,
D
>
();
for
(
size_t
i
=
0
;
i
<
D
;
++
i
)
{
offsets
[
i
]
=
0
;
extents
[
i
]
=
new_out_dims
[
i
];
}
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
int
start
=
starts
[
i
];
if
(
start
<
0
)
{
start
=
(
start
+
in_dims
[
axes
[
i
]]);
}
start
=
std
::
max
(
start
,
0
);
offsets
[
axes
[
i
]]
=
start
;
}
auto
in_t
=
phi
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
input
);
auto
out_t
=
phi
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
out
,
new_out_dims
);
phi
::
funcs
::
EigenSlice
<
std
::
decay_t
<
decltype
(
place
)
>
,
T
,
D
>::
Eval
(
place
,
out_t
,
in_t
,
offsets
,
extents
);
}
static
inline
double
ToMegaBytes
(
size_t
bytes
)
{
return
static_cast
<
double
>
(
bytes
)
/
(
1
<<
20
);
}
...
...
@@ -70,12 +33,12 @@ static inline bool UseFixedWorkspace() {
static
size_t
CalcWorkspaceLimitInBytes
(
bool
use_fixed_workspace
)
{
if
(
!
use_fixed_workspace
)
{
int
device_id
=
p
latform
::
GetCurrentDeviceId
();
int
device_id
=
p
hi
::
backends
::
gpu
::
GetCurrentDeviceId
();
int64_t
allocated
=
memory
::
DeviceMemoryStatCurrentValue
(
"Allocated"
,
device_id
);
paddle
::
memory
::
DeviceMemoryStatCurrentValue
(
"Allocated"
,
device_id
);
int64_t
reserved
=
memory
::
DeviceMemoryStatCurrentValue
(
"Reserved"
,
device_id
);
int64_t
availble
=
platform
::
GpuAvailableMemToAlloc
();
paddle
::
memory
::
DeviceMemoryStatCurrentValue
(
"Reserved"
,
device_id
);
int64_t
availble
=
p
addle
::
p
latform
::
GpuAvailableMemToAlloc
();
VLOG
(
3
)
<<
"[memory] allocated="
<<
ToMegaBytes
(
allocated
)
<<
" MB, reserved="
<<
ToMegaBytes
(
reserved
)
<<
" MB, available_to_alloc="
<<
ToMegaBytes
(
availble
)
<<
" MB."
;
...
...
@@ -164,14 +127,13 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
cudnnConvolutionFwdAlgo_t
algo
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
algo
,
&
workspace_size
));
phi
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
algo
,
&
workspace_size
));
return
workspace_size
;
}
...
...
@@ -193,7 +155,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
int
best_algo_idx
=
0
;
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_FWD_ALGS
);
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
p
hi
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
...
...
@@ -220,7 +182,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
...
...
@@ -233,7 +195,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
}
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
...
...
@@ -261,7 +223,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_FWD_ALGS
);
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
p
hi
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
...
...
@@ -299,15 +261,14 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
size_t
max_workspace_size
=
0
;
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_FWD_ALGS
;
++
algo
)
{
size_t
workspace_size
=
0
;
auto
status
=
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
algo
),
&
workspace_size
);
auto
status
=
phi
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
algo
),
&
workspace_size
);
if
(
status
==
CUDNN_STATUS_SUCCESS
&&
workspace_size
<=
workspace_size_limit
)
{
max_workspace_size
=
std
::
max
(
workspace_size
,
max_workspace_size
);
...
...
@@ -339,7 +300,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
cudnnConvolutionBwdDataAlgo_t
algo
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -369,7 +330,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
int
best_algo_idx
=
0
;
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_DATA_ALGS
);
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm_v7
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm_v7
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -404,7 +365,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -417,7 +378,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
}
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -445,7 +406,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_DATA_ALGS
);
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
p
hi
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
w
->
data
<
T
>
(),
...
...
@@ -484,7 +445,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_BWD_DATA_ALGS
;
++
algo
)
{
size_t
workspace_size
=
0
;
auto
status
=
p
latform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -519,10 +480,10 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
cudnnConvolutionBwdFilterAlgo_t
algo
)
{
platform
::
CUDAGraphCaptureModeGuard
guard
;
p
addle
::
p
latform
::
CUDAGraphCaptureModeGuard
guard
;
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -552,7 +513,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
int
best_algo_idx
=
0
;
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_FILTER_ALGS
);
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm_v7
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm_v7
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -575,7 +536,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -588,7 +549,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
}
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -612,7 +573,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
size_t
workspace_size_limit
=
CalcWorkspaceLimitInBytes
(
UseFixedWorkspace
());
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
if
(
platform
::
CudnnDataType
<
T
>::
type
!=
CUDNN_DATA_HALF
)
{
if
(
p
addle
::
p
latform
::
CudnnDataType
<
T
>::
type
!=
CUDNN_DATA_HALF
)
{
size_t
max_workspace_size
=
GetMaxWorkspaceSize
(
args
,
workspace_size_limit
);
VLOG
(
3
)
<<
"max_workspace_size="
<<
ToMegaBytes
(
max_workspace_size
)
...
...
@@ -620,7 +581,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
p
hi
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
...
...
@@ -649,7 +610,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
int
max_algos
=
GetAlgorithmMaxCount
(
args
.
handle
);
std
::
vector
<
PerfT
>
perf_results
(
max_algos
);
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithm
(
p
hi
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -676,7 +637,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
#if CUDNN_VERSION_MIN(7, 0, 1)
int
max_algos
=
0
;
auto
status
=
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithmMaxCount
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithmMaxCount
(
handle
,
&
max_algos
);
if
(
status
==
gpuSuccess
)
{
VLOG
(
5
)
<<
"[BackwardFilter] max_algos: predefined="
...
...
@@ -694,7 +655,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_BWD_FILTER_ALGS
;
++
algo
)
{
size_t
workspace_size
=
0
;
auto
status
=
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
...
...
@@ -762,7 +723,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
bool
enable_autotune
=
true
)
{
SearchResult
<
AlgoT
>
result
;
bool
use_autotune
=
false
;
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
auto
dtype
=
p
addle
::
p
latform
::
CudnnDataType
<
T
>::
type
;
SetConvMathType
(
ctx
,
dtype
,
args
.
cdesc
);
if
(
deterministic
)
{
...
...
@@ -819,12 +780,13 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
return
result
;
}
static
void
SetConvMathType
(
const
phi
::
GPUContext
&
ctx
,
cudnnDataType_t
dtype
,
const
platform
::
ConvolutionDescriptor
&
cdesc
)
{
static
void
SetConvMathType
(
const
phi
::
GPUContext
&
ctx
,
cudnnDataType_t
dtype
,
const
paddle
::
platform
::
ConvolutionDescriptor
&
cdesc
)
{
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
if
(
ctx
.
GetComputeCapability
()
>=
70
&&
dtype
==
CUDNN_DATA_HALF
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnSetConvolutionMathType
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
hi
::
dynload
::
cudnnSetConvolutionMathType
(
cdesc
.
desc
(),
CUDNN_TENSOR_OP_MATH
));
VLOG
(
5
)
<<
"Enable Tensor Core for FLOAT16"
;
#if CUDA_VERSION >= 11000
...
...
@@ -832,21 +794,20 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
}
else
if
(
ctx
.
GetComputeCapability
()
>=
80
&&
dtype
==
CUDNN_DATA_BFLOAT16
)
{
VLOG
(
5
)
<<
"Enable Tensor Core for BFLOAT16"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnSetConvolutionMathType
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
hi
::
dynload
::
cudnnSetConvolutionMathType
(
cdesc
.
desc
(),
CUDNN_TENSOR_OP_MATH
));
#endif // CUDNN_VERSION_MIN(8, 1, 0)
}
else
if
(
dtype
==
CUDNN_DATA_FLOAT
&&
!
cdesc
.
allow_tf32_
)
{
VLOG
(
5
)
<<
"Disable TensorFloat (Tensor Core) for FLOAT"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnSetConvolutionMathType
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
hi
::
dynload
::
cudnnSetConvolutionMathType
(
cdesc
.
desc
(),
CUDNN_FMA_MATH
));
#endif // CUDA_VERSION >= 11000
}
else
{
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnSetConvolutionMathType
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
hi
::
dynload
::
cudnnSetConvolutionMathType
(
cdesc
.
desc
(),
CUDNN_DEFAULT_MATH
));
}
#endif
}
};
}
// namespace operators
}
// namespace paddle
}
// namespace phi
paddle/
fluid/operators/conv_base_helper
.h
→
paddle/
phi/kernels/gpudnn/conv_gpudnn_base
.h
浏览文件 @
31f57f29
...
...
@@ -20,21 +20,19 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
namespace
paddle
{
namespace
operators
{
namespace
phi
{
using
Tensor
=
phi
::
DenseTensor
;
using
DataLayout
=
platform
::
DataLayout
;
using
framework
::
AlgorithmsCache
;
using
framework
::
ConvSearchCache
;
using
GPUDNNDataLayout
=
paddle
::
platform
::
DataLayout
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
using
ScalingParamType
=
typename
paddle
::
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
// As the container of searchAlgorithm::Find() result.
template
<
typename
AlgoT
>
...
...
@@ -71,9 +69,9 @@ static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
template
<
typename
HandleT
,
typename
DataT
>
struct
ConvArgsBase
{
HandleT
handle
;
platform
::
TensorDescriptor
idesc
,
odesc
;
platform
::
FilterDescriptor
wdesc
;
platform
::
ConvolutionDescriptor
cdesc
;
p
addle
::
p
latform
::
TensorDescriptor
idesc
,
odesc
;
p
addle
::
p
latform
::
FilterDescriptor
wdesc
;
p
addle
::
p
latform
::
ConvolutionDescriptor
cdesc
;
const
phi
::
DenseTensor
*
x
,
*
w
,
*
o
;
DataT
cudnn_dtype
;
...
...
@@ -88,7 +86,7 @@ struct ConvArgsBase {
int
group
;
// data foramt
DataLayout
data_layout
;
GPUDNN
DataLayout
data_layout
;
ConvArgsBase
(
const
phi
::
DenseTensor
*
x
,
const
phi
::
DenseTensor
*
w
,
...
...
@@ -98,7 +96,7 @@ struct ConvArgsBase {
const
std
::
vector
<
int
>
d
,
DataT
dtype
,
int
g
,
DataLayout
layout
)
GPUDNN
DataLayout
layout
)
:
x
(
x
),
w
(
w
),
o
(
o
),
...
...
@@ -131,16 +129,16 @@ struct ConvArgsBase {
}
};
static
inline
void
GetNCDHW
(
const
framework
::
DDim
&
dims
,
const
DataLayout
&
layout
,
static
inline
void
GetNCDHW
(
const
phi
::
DDim
&
dims
,
const
GPUDNN
DataLayout
&
layout
,
int
*
N
,
int
*
C
,
int
*
D
,
int
*
H
,
int
*
W
)
{
*
N
=
dims
[
0
];
*
C
=
layout
==
DataLayout
::
kNCHW
?
dims
[
1
]
:
dims
[
dims
.
size
()
-
1
];
int
i
=
layout
==
DataLayout
::
kNCHW
?
0
:
1
;
*
C
=
layout
==
GPUDNN
DataLayout
::
kNCHW
?
dims
[
1
]
:
dims
[
dims
.
size
()
-
1
];
int
i
=
layout
==
GPUDNN
DataLayout
::
kNCHW
?
0
:
1
;
if
(
dims
.
size
()
==
5
)
{
*
D
=
dims
[
2
-
i
];
*
H
=
dims
[
3
-
i
];
...
...
@@ -152,5 +150,38 @@ static inline void GetNCDHW(const framework::DDim& dims,
}
}
}
// namespace operators
}
// namespace paddle
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
static
void
RemovePaddingSlice
(
const
phi
::
GPUContext
&
context
,
const
phi
::
DenseTensor
*
input
,
phi
::
DenseTensor
*
out
,
const
std
::
vector
<
int
>&
starts
,
const
std
::
vector
<
int
>&
axes
)
{
auto
&
place
=
*
context
.
eigen_device
();
auto
in_dims
=
input
->
dims
();
auto
new_out_dims
=
out
->
dims
();
auto
offsets
=
Eigen
::
DSizes
<
Eigen
::
DenseIndex
,
D
>
();
auto
extents
=
Eigen
::
DSizes
<
Eigen
::
DenseIndex
,
D
>
();
for
(
size_t
i
=
0
;
i
<
D
;
++
i
)
{
offsets
[
i
]
=
0
;
extents
[
i
]
=
new_out_dims
[
i
];
}
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
int
start
=
starts
[
i
];
if
(
start
<
0
)
{
start
=
(
start
+
in_dims
[
axes
[
i
]]);
}
start
=
std
::
max
(
start
,
0
);
offsets
[
axes
[
i
]]
=
start
;
}
auto
in_t
=
phi
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
input
);
auto
out_t
=
phi
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
out
,
new_out_dims
);
phi
::
funcs
::
EigenSlice
<
std
::
decay_t
<
decltype
(
place
)
>
,
T
,
D
>::
Eval
(
place
,
out_t
,
in_t
,
offsets
,
extents
);
}
}
// namespace phi
paddle/
fluid/operators/conv_cudnn_op_cache
.h
→
paddle/
phi/kernels/gpudnn/conv_gpudnn_info
.h
浏览文件 @
31f57f29
...
...
@@ -18,15 +18,14 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
DECLARE_int64
(
conv_workspace_size_limit
);
DECLARE_bool
(
cudnn_exhaustive_search
);
DECLARE_int64
(
cudnn_exhaustive_search_times
);
namespace
p
addle
{
namespace
operators
{
namespace
p
hi
{
#ifdef PADDLE_WITH_HIP
static
constexpr
size_t
kNUM_CUDNN_FWD_ALGS
=
1
;
static
constexpr
size_t
kNUM_CUDNN_BWD_FILTER_ALGS
=
1
;
...
...
@@ -39,5 +38,4 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT
;
#endif
}
// namespace operators
}
// namespace paddle
}
// namespace phi
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
浏览文件 @
31f57f29
...
...
@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/
fluid/operators
/conv_miopen_helper.h"
#include "paddle/
phi/kernels/gpudnn
/conv_miopen_helper.h"
#else
#include "paddle/
fluid/operators/conv_cudnn_helper
.h"
#include "paddle/
phi/kernels/gpudnn/conv_cudnn_v7
.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
...
...
@@ -257,55 +257,53 @@ void ConvCudnnGradGradKernel(
auto
layout
=
paddle
::
platform
::
GetCudnnTensorFormat
(
paddle
::
platform
::
DataLayout
::
kNCHW
);
paddle
::
operators
::
ConvArgs
args1
{
&
transformed_ddX
,
W
,
&
transformed_ddO_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args2
{
&
transformed_X
,
ddW
,
&
transformed_ddO_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args3
{
&
transformed_ddX
,
dW
,
&
transformed_dO_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args4
{
&
transformed_dX
,
ddW
,
&
transformed_dO_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
ConvArgs
args1
{
&
transformed_ddX
,
W
,
&
transformed_ddO_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
ConvArgs
args2
{
&
transformed_X
,
ddW
,
&
transformed_ddO_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
ConvArgs
args3
{
&
transformed_ddX
,
dW
,
&
transformed_dO_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
ConvArgs
args4
{
&
transformed_dX
,
ddW
,
&
transformed_dO_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result1
;
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result2
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
data_result
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result1
;
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result2
;
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
data_result
;
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result1
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result2
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
data_result
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result1
;
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result2
;
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
data_result
;
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
#endif
// ddo = conv(ddI, W) + conv(I, ddW)
...
...
@@ -328,14 +326,12 @@ void ConvCudnnGradGradKernel(
c_group
);
#ifdef PADDLE_WITH_HIP
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
using
search1
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
);
fwd_result1
.
algo
=
search1
::
Find
<
T
>
(
args1
,
exhaustive_search
,
false
,
workspace_size
,
ctx
);
#else
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
using
search1
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_result1
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
exhaustive_search
,
false
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
fwd_result1
.
algo
);
#endif
...
...
@@ -355,15 +351,13 @@ void ConvCudnnGradGradKernel(
c_group
);
#ifdef PADDLE_WITH_HIP
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
using
search2
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
fwd_result2
.
algo
=
search2
::
Find
<
T
>
(
args2
,
exhaustive_search
,
false
,
workspace_size
,
ctx
);
#else
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
using
search2
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_result2
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
exhaustive_search
,
false
);
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
fwd_result2
.
algo
));
...
...
@@ -385,14 +379,12 @@ void ConvCudnnGradGradKernel(
c_group
);
#ifdef PADDLE_WITH_HIP
using
search3
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
using
search3
=
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
));
filter_result
.
algo
=
search3
::
Find
<
T
>
(
args3
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
#else
using
search3
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
using
search3
=
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_result
=
search3
::
Find
<
T
>
(
ctx
,
args3
,
exhaustive_search
,
deterministic
);
workspace_size
=
std
::
max
(
...
...
@@ -415,14 +407,12 @@ void ConvCudnnGradGradKernel(
c_group
);
#ifdef PADDLE_WITH_HIP
using
search4
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
using
search4
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
));
data_result
.
algo
=
search4
::
Find
<
T
>
(
args4
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
#else
using
search4
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
using
search4
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
data_result
=
search4
::
Find
<
T
>
(
ctx
,
args4
,
exhaustive_search
,
deterministic
);
workspace_size
=
std
::
max
(
...
...
@@ -447,8 +437,8 @@ void ConvCudnnGradGradKernel(
int
group_offset_out
=
o_c
/
groups
*
o_h
*
o_w
*
o_d
;
int
group_offset_filter
=
W
->
numel
()
/
groups
;
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
// NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
...
...
@@ -657,10 +647,10 @@ void ConvCudnnGradGradKernel(
axes
[
i
]
=
i
;
}
if
(
X
->
dims
().
size
()
==
4
)
{
paddle
::
operators
::
RemovePaddingSlice
<
Context
,
T
,
4
>
(
RemovePaddingSlice
<
Context
,
T
,
4
>
(
ctx
,
&
transformed_dX
,
&
transformed_dX_channel
,
starts
,
axes
);
}
else
{
paddle
::
operators
::
RemovePaddingSlice
<
Context
,
T
,
5
>
(
RemovePaddingSlice
<
Context
,
T
,
5
>
(
ctx
,
&
transformed_dX
,
&
transformed_dX_channel
,
starts
,
axes
);
}
}
...
...
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
浏览文件 @
31f57f29
...
...
@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/
fluid/operators
/conv_miopen_helper.h"
#include "paddle/
phi/kernels/gpudnn
/conv_miopen_helper.h"
#else
#include "paddle/
fluid/operators/conv_cudnn_helper
.h"
#include "paddle/
phi/kernels/gpudnn/conv_cudnn_v7
.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
...
...
@@ -256,24 +256,24 @@ void ConvCudnnGradKernel(const Context& ctx,
?
paddle
::
platform
::
DataLayout
::
kNHWC
:
paddle
::
platform
::
DataLayout
::
kNCHW
;
paddle
::
operators
::
ConvArgs
args1
{
&
transformed_input_grad
,
&
transformed_filter_channel
,
&
transformed_output_grad_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
layout
};
paddle
::
operators
::
ConvArgs
args2
{
&
transformed_input
,
&
transformed_filter_grad_channel
,
&
transformed_output_grad_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
layout
};
ConvArgs
args1
{
&
transformed_input_grad
,
&
transformed_filter_channel
,
&
transformed_output_grad_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
layout
};
ConvArgs
args2
{
&
transformed_input
,
&
transformed_filter_grad_channel
,
&
transformed_output_grad_channel
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
layout
};
auto
handle
=
ctx
.
cudnn_handle
();
// TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
...
...
@@ -289,35 +289,35 @@ void ConvCudnnGradKernel(const Context& ctx,
int
i_n
,
i_c
,
i_d
,
i_h
,
i_w
;
int
o_n
,
o_c
,
o_d
,
o_h
,
o_w
;
if
(
compute_format
==
paddle
::
platform
::
DataLayout
::
kNHWC
)
{
paddle
::
operators
::
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
i_n
,
&
i_c
,
&
i_d
,
&
i_h
,
&
i_w
);
paddle
::
operators
::
GetNCDHW
(
transformed_output_grad_channel
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
o_n
,
&
o_c
,
&
o_d
,
&
o_h
,
&
o_w
);
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
i_n
,
&
i_c
,
&
i_d
,
&
i_h
,
&
i_w
);
GetNCDHW
(
transformed_output_grad_channel
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
o_n
,
&
o_c
,
&
o_d
,
&
o_h
,
&
o_w
);
}
else
{
paddle
::
operators
::
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
i_n
,
&
i_c
,
&
i_d
,
&
i_h
,
&
i_w
);
paddle
::
operators
::
GetNCDHW
(
transformed_output_grad_channel
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
o_n
,
&
o_c
,
&
o_d
,
&
o_h
,
&
o_w
);
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
i_n
,
&
i_c
,
&
i_d
,
&
i_h
,
&
i_w
);
GetNCDHW
(
transformed_output_grad_channel
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
o_n
,
&
o_c
,
&
o_d
,
&
o_h
,
&
o_w
);
}
int
group_offset_in
=
i_c
/
groups
*
i_h
*
i_w
*
i_d
;
...
...
@@ -326,13 +326,11 @@ void ConvCudnnGradKernel(const Context& ctx,
// ------------------- cudnn backward algorithm ---------------------
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result
;
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result
;
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
#endif
// input data workspace_size
size_t
workspace_size_d
=
0
;
...
...
@@ -364,15 +362,13 @@ void ConvCudnnGradKernel(const Context& ctx,
c_groups
);
#ifdef PADDLE_WITH_HIP
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
using
search1
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size_d
=
std
::
max
(
workspace_size_d
,
search1
::
GetWorkspaceSize
(
args1
));
bwd_result
.
algo
=
search1
::
Find
<
T
>
(
args1
,
exhaustive_search
,
deterministic
,
workspace_size_d
,
ctx
);
#else
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
using
search1
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
bwd_result
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
exhaustive_search
,
deterministic
);
workspace_size_d
=
std
::
max
(
workspace_size_d
,
bwd_result
.
workspace_size
);
#endif
...
...
@@ -392,15 +388,13 @@ void ConvCudnnGradKernel(const Context& ctx,
paddle
::
platform
::
AllowTF32Cudnn
(),
c_groups
);
#ifdef PADDLE_WITH_HIP
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
using
search2
=
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size_w
=
std
::
max
(
workspace_size_w
,
search2
::
GetWorkspaceSize
(
args2
));
filter_result
.
algo
=
search2
::
Find
<
T
>
(
args2
,
exhaustive_search
,
deterministic
,
workspace_size_w
,
ctx
);
#else
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
using
search2
=
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_result
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
exhaustive_search
,
deterministic
);
VLOG
(
3
)
<<
"filter algo: "
<<
filter_result
.
algo
<<
", time "
...
...
@@ -410,12 +404,12 @@ void ConvCudnnGradKernel(const Context& ctx,
}
// ------------------- cudnn conv backward data ---------------------
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
#ifdef PADDLE_WITH_HIP
// MIOPEN ONLY support beta to be 0.0f
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
#else
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
use_addto
?
1.0
f
:
0.0
f
;
ScalingParamType
<
T
>
beta
=
use_addto
?
1.0
f
:
0.0
f
;
#endif
VLOG
(
4
)
<<
"Conv_grad: use_addto = "
<<
use_addto
;
...
...
@@ -515,19 +509,17 @@ void ConvCudnnGradKernel(const Context& ctx,
ctx
.
template
Alloc
<
T
>(
&
transformed_input_grad_channel
);
if
(
transformed_input_channel
.
dims
().
size
()
==
4
)
{
paddle
::
operators
::
RemovePaddingSlice
<
Context
,
T
,
4
>
(
ctx
,
&
transformed_input_grad
,
&
transformed_input_grad_channel
,
starts
,
axes
);
RemovePaddingSlice
<
Context
,
T
,
4
>
(
ctx
,
&
transformed_input_grad
,
&
transformed_input_grad_channel
,
starts
,
axes
);
}
else
{
paddle
::
operators
::
RemovePaddingSlice
<
Context
,
T
,
5
>
(
ctx
,
&
transformed_input_grad
,
&
transformed_input_grad_channel
,
starts
,
axes
);
RemovePaddingSlice
<
Context
,
T
,
5
>
(
ctx
,
&
transformed_input_grad
,
&
transformed_input_grad_channel
,
starts
,
axes
);
}
}
...
...
@@ -538,7 +530,7 @@ void ConvCudnnGradKernel(const Context& ctx,
}
// filter_grad do not use inplace addto.
paddle
::
operators
::
ScalingParamType
<
T
>
beta_filter
=
0.0
f
;
ScalingParamType
<
T
>
beta_filter
=
0.0
f
;
// ------------------- cudnn conv backward filter ---------------------
if
(
filter_grad
)
{
// Because beta is zero, it is unnecessary to reset filter_grad.
...
...
paddle/phi/kernels/gpudnn/conv_kernel.cu
浏览文件 @
31f57f29
...
...
@@ -19,9 +19,9 @@
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/
fluid/operators
/conv_miopen_helper.h"
#include "paddle/
phi/kernels/gpudnn
/conv_miopen_helper.h"
#else
#include "paddle/
fluid/operators/conv_cudnn_helper
.h"
#include "paddle/
phi/kernels/gpudnn/conv_cudnn_v7
.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
...
...
@@ -205,15 +205,15 @@ void ConvCudnnKernel(const Context& ctx,
const
T
*
filter_data
=
transformed_filter_channel
.
data
<
T
>
();
// ------------------- cudnn descriptors ---------------------
paddle
::
operators
::
ConvArgs
args
{
&
transformed_input
,
&
transformed_filter_channel
,
&
transformed_output
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
compute_format
};
ConvArgs
args
{
&
transformed_input
,
&
transformed_filter_channel
,
&
transformed_output
,
strides
,
padding_common
,
dilations
,
dtype
,
groups
,
compute_format
};
auto
handle
=
ctx
.
cudnn_handle
();
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
...
...
@@ -266,35 +266,35 @@ void ConvCudnnKernel(const Context& ctx,
int
o_n
,
o_c
,
o_d
,
o_h
,
o_w
;
if
(
compute_format
==
paddle
::
platform
::
DataLayout
::
kNHWC
)
{
paddle
::
operators
::
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
i_n
,
&
i_c
,
&
i_d
,
&
i_h
,
&
i_w
);
paddle
::
operators
::
GetNCDHW
(
transformed_output
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
o_n
,
&
o_c
,
&
o_d
,
&
o_h
,
&
o_w
);
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
i_n
,
&
i_c
,
&
i_d
,
&
i_h
,
&
i_w
);
GetNCDHW
(
transformed_output
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
o_n
,
&
o_c
,
&
o_d
,
&
o_h
,
&
o_w
);
}
else
{
paddle
::
operators
::
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
i_n
,
&
i_c
,
&
i_d
,
&
i_h
,
&
i_w
);
paddle
::
operators
::
GetNCDHW
(
transformed_output
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
o_n
,
&
o_c
,
&
o_d
,
&
o_h
,
&
o_w
);
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
i_n
,
&
i_c
,
&
i_d
,
&
i_h
,
&
i_w
);
GetNCDHW
(
transformed_output
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
o_n
,
&
o_c
,
&
o_d
,
&
o_h
,
&
o_w
);
}
int
group_offset_in
=
i_c
/
groups
*
i_h
*
i_w
*
i_d
;
...
...
@@ -304,15 +304,14 @@ void ConvCudnnKernel(const Context& ctx,
size_t
workspace_size
=
0
;
// final workspace to allocate.
// ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
using
search
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
using
search
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
search
::
GetWorkspaceSize
(
args
);
fwd_result
.
algo
=
search
::
Find
<
T
>
(
args
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
using
search
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
using
search
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_result
=
search
::
Find
<
T
>
(
ctx
,
args
,
exhaustive_search
,
deterministic
);
workspace_size
=
fwd_result
.
workspace_size
;
#endif
...
...
@@ -328,8 +327,8 @@ void ConvCudnnKernel(const Context& ctx,
#endif
// ------------------- cudnn conv forward ---------------------
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
// NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
...
...
paddle/
fluid/operators
/conv_miopen_helper.h
→
paddle/
phi/kernels/gpudnn
/conv_miopen_helper.h
浏览文件 @
31f57f29
...
...
@@ -14,48 +14,12 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/conv_base_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
namespace
paddle
{
namespace
operators
{
namespace
phi
{
using
ConvArgs
=
ConvArgsBase
<
miopenHandle_t
,
miopenDataType_t
>
;
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
static
void
RemovePaddingSlice
(
const
phi
::
GPUContext
&
context
,
const
phi
::
DenseTensor
*
input
,
phi
::
DenseTensor
*
out
,
const
std
::
vector
<
int
>&
starts
,
const
std
::
vector
<
int
>&
axes
)
{
auto
&
place
=
*
context
.
eigen_device
();
auto
in_dims
=
input
->
dims
();
auto
new_out_dims
=
out
->
dims
();
auto
offsets
=
Eigen
::
array
<
int
,
D
>
();
auto
extents
=
Eigen
::
array
<
int
,
D
>
();
for
(
size_t
i
=
0
;
i
<
D
;
++
i
)
{
offsets
[
i
]
=
0
;
extents
[
i
]
=
new_out_dims
[
i
];
}
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
int
start
=
starts
[
i
];
if
(
start
<
0
)
{
start
=
(
start
+
in_dims
[
axes
[
i
]]);
}
start
=
std
::
max
(
start
,
0
);
offsets
[
axes
[
i
]]
=
start
;
}
auto
in_t
=
framework
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
input
);
auto
out_t
=
framework
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
out
,
new_out_dims
);
out_t
.
device
(
place
)
=
in_t
.
slice
(
offsets
,
extents
);
}
template
<
typename
PerfT
>
struct
SearchAlgorithm
{};
...
...
@@ -78,7 +42,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
miopenConvAlgoPerf_t
find_result
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenFindConvolutionForwardAlgorithm
(
p
hi
::
dynload
::
miopenFindConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
...
...
@@ -104,7 +68,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenConvolutionForwardGetWorkSpaceSize
(
p
hi
::
dynload
::
miopenConvolutionForwardGetWorkSpaceSize
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
idesc
.
desc
(),
...
...
@@ -134,7 +98,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
miopenConvAlgoPerf_t
find_result
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenFindConvolutionBackwardDataAlgorithm
(
p
hi
::
dynload
::
miopenFindConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
o
->
data
<
T
>
(),
...
...
@@ -160,7 +124,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenConvolutionBackwardDataGetWorkSpaceSize
(
p
hi
::
dynload
::
miopenConvolutionBackwardDataGetWorkSpaceSize
(
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
wdesc
.
desc
(),
...
...
@@ -190,7 +154,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
miopenConvAlgoPerf_t
find_result
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenFindConvolutionBackwardWeightsAlgorithm
(
p
hi
::
dynload
::
miopenFindConvolutionBackwardWeightsAlgorithm
(
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
o
->
data
<
T
>
(),
...
...
@@ -216,7 +180,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenConvolutionBackwardWeightsGetWorkSpaceSize
(
p
hi
::
dynload
::
miopenConvolutionBackwardWeightsGetWorkSpaceSize
(
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
idesc
.
desc
(),
...
...
@@ -227,5 +191,4 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
}
};
}
// namespace operators
}
// namespace paddle
}
// namespace phi
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
浏览文件 @
31f57f29
...
...
@@ -28,11 +28,11 @@ limitations under the License. */
#include "paddle/phi/kernels/transpose_kernel.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif
namespace
phi
{
...
...
@@ -173,33 +173,31 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
auto
dtype
=
paddle
::
platform
::
CudnnDataType
<
T
>::
type
;
paddle
::
operators
::
ConvArgs
args1
{
&
transformed_dout
,
&
filter
,
&
x_transpose
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
layout
};
paddle
::
operators
::
ConvArgs
args2
{
&
transformed_dout
,
&
filter
,
&
x_transpose
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
layout
};
ConvArgs
args1
{
&
transformed_dout
,
&
filter
,
&
x_transpose
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
layout
};
ConvArgs
args2
{
&
transformed_dout
,
&
filter
,
&
x_transpose
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
layout
};
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
#endif
auto
layout_tensor
=
paddle
::
platform
::
GetCudnnTensorFormat
(
layout
);
...
...
@@ -222,14 +220,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
paddle
::
platform
::
AllowTF32Cudnn
(),
c_groups
);
#ifdef PADDLE_WITH_HIP
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
using
search1
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
));
fwd_result
.
algo
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
using
search1
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_result
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
,
fwd_result
.
algo
));
...
...
@@ -249,14 +245,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
paddle
::
platform
::
AllowTF32Cudnn
(),
c_groups
);
#ifdef PADDLE_WITH_HIP
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
using
search2
=
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
filter_result
.
algo
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
using
search2
=
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_result
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
filter_result
.
algo
));
...
...
@@ -269,8 +263,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
int
dout_offset
=
transformed_dout
.
numel
()
/
transformed_dout
.
dims
()[
0
]
/
groups
;
int
filter_offset
=
filter
.
numel
()
/
groups
;
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
if
(
dx
)
{
// Because beta is zero, it is unnecessary to reset dx.
...
...
@@ -631,55 +625,53 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
auto
handle
=
ctx
.
cudnn_handle
();
auto
layout
=
paddle
::
platform
::
GetCudnnTensorFormat
(
GPUDNNDataLayout
::
kNCHW
);
paddle
::
operators
::
ConvArgs
args1
{
&
transformed_ddout_channel
,
&
filter
,
&
transformed_ddx
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args2
{
&
transformed_ddout_channel
,
&
ddfilter
,
&
transformed_x
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args3
{
&
transformed_dout
,
dfilter
,
&
transformed_ddx_channel
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args4
{
&
transformed_dout
,
&
ddfilter
,
&
transformed_dx_channel
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
ConvArgs
args1
{
&
transformed_ddout_channel
,
&
filter
,
&
transformed_ddx
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
ConvArgs
args2
{
&
transformed_ddout_channel
,
&
ddfilter
,
&
transformed_x
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
ConvArgs
args3
{
&
transformed_dout
,
dfilter
,
&
transformed_ddx_channel
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
ConvArgs
args4
{
&
transformed_dout
,
&
ddfilter
,
&
transformed_dx_channel
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result1
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result2
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result1
;
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result2
;
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result1
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result2
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result1
;
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result2
;
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
#endif
// ddo = conv(ddI, filter) + conv(I, ddfilter)
...
...
@@ -702,14 +694,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle
::
platform
::
AllowTF32Cudnn
(),
c_group
);
#ifdef PADDLE_WITH_HIP
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
using
search1
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
);
bwd_result1
.
algo
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
using
search1
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
bwd_result1
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
false
,
deterministic
,
false
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
bwd_result1
.
algo
);
#endif
...
...
@@ -726,14 +716,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle
::
platform
::
AllowTF32Cudnn
(),
c_group
);
#ifdef PADDLE_WITH_HIP
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
using
search2
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
bwd_result2
.
algo
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
using
search2
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
bwd_result2
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
bwd_result2
.
algo
));
...
...
@@ -753,14 +741,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle
::
platform
::
AllowTF32Cudnn
(),
c_group
);
#ifdef PADDLE_WITH_HIP
using
search3
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
using
search3
=
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
));
filter_result
.
algo
=
search3
::
Find
<
T
>
(
args3
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
using
search3
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
using
search3
=
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_result
=
search3
::
Find
<
T
>
(
ctx
,
args3
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
,
filter_result
.
algo
));
...
...
@@ -781,14 +767,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle
::
platform
::
AllowTF32Cudnn
(),
c_group
);
#ifdef PADDLE_WITH_HIP
using
search4
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
using
search4
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
));
fwd_result
.
algo
=
search4
::
Find
<
T
>
(
args4
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
using
search4
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
using
search4
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_result
=
search4
::
Find
<
T
>
(
ctx
,
args4
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
,
fwd_result
.
algo
));
...
...
@@ -796,22 +780,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
}
int
i_n
,
i_c
,
i_d
,
i_h
,
i_w
;
paddle
::
operators
::
GetNCDHW
(
transformed_x
.
dims
(),
GPUDNNDataLayout
::
kNCHW
,
&
i_n
,
&
i_c
,
&
i_d
,
&
i_h
,
&
i_w
);
GetNCDHW
(
transformed_x
.
dims
(),
GPUDNNDataLayout
::
kNCHW
,
&
i_n
,
&
i_c
,
&
i_d
,
&
i_h
,
&
i_w
);
int
o_n
,
o_c
,
o_d
,
o_h
,
o_w
;
paddle
::
operators
::
GetNCDHW
(
transformed_dout
.
dims
(),
GPUDNNDataLayout
::
kNCHW
,
&
o_n
,
&
o_c
,
&
o_d
,
&
o_h
,
&
o_w
);
GetNCDHW
(
transformed_dout
.
dims
(),
GPUDNNDataLayout
::
kNCHW
,
&
o_n
,
&
o_c
,
&
o_d
,
&
o_h
,
&
o_w
);
int
group_offset_in
=
transformed_x
.
numel
()
/
transformed_x
.
dims
()[
0
]
/
groups
;
...
...
@@ -819,8 +803,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
transformed_dout
.
numel
()
/
transformed_dout
.
dims
()[
0
]
/
groups
;
int
group_offset_filter
=
filter
.
numel
()
/
groups
;
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
auto
wkspace_handle
=
ctx
.
cudnn_workspace_handle
();
...
...
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
浏览文件 @
31f57f29
...
...
@@ -26,11 +26,11 @@ limitations under the License. */
#include "paddle/phi/kernels/transpose_kernel.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif
namespace
phi
{
...
...
@@ -199,15 +199,15 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
auto
dtype
=
paddle
::
platform
::
CudnnDataType
<
T
>::
type
;
// ------------------- cudnn descriptors ---------------------
paddle
::
operators
::
ConvArgs
args
{
&
transformed_out
,
&
filter
,
&
transformed_x
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
data_layout
};
ConvArgs
args
{
&
transformed_out
,
&
filter
,
&
transformed_x
,
strides
,
padding_common
,
dilations_
,
dtype
,
groups
,
data_layout
};
args
.
handle
=
handle
;
args
.
idesc
.
set
(
transformed_out
,
iwo_groups
);
args
.
wdesc
.
set
(
filter
,
layout_tensor
,
iwo_groups
);
...
...
@@ -220,16 +220,14 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
c_groups
);
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result
;
using
search
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result
;
using
search
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
));
bwd_result
.
algo
=
search
::
Find
<
T
>
(
args
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result
;
using
search
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result
;
using
search
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
bwd_result
=
search
::
Find
<
T
>
(
ctx
,
args
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
,
bwd_result
.
algo
));
...
...
@@ -239,8 +237,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
int
x_offset
=
transformed_x
.
numel
()
/
transformed_x
.
dims
()[
0
]
/
groups
;
int
out_offset
=
transformed_out
.
numel
()
/
transformed_out
.
dims
()[
0
]
/
groups
;
int
filter_offset
=
filter
.
numel
()
/
groups
;
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
#ifdef PADDLE_WITH_HIP
...
...
paddle/phi/kernels/impl/conv_cudnn_impl.h
浏览文件 @
31f57f29
...
...
@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/
fluid/operators
/conv_miopen_helper.h"
#include "paddle/
phi/kernels/gpudnn
/conv_miopen_helper.h"
#else
#include "paddle/
fluid/operators/conv_cudnn_helper
.h"
#include "paddle/
phi/kernels/gpudnn/conv_cudnn_v7
.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录