Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
31f57f29
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2297
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
31f57f29
编写于
10月 24, 2022
作者:
Y
Yiqun Liu
提交者:
GitHub
10月 24, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Move the header file of conv cudnn and miopen to phi directory. (#47248)
上级
a5f556f0
变更
14
隐藏空白更改
内联
并排
Showing
14 changed file
with
415 addition
and
502 deletion
+415
-502
paddle/fluid/framework/var_type_traits.cc
paddle/fluid/framework/var_type_traits.cc
+1
-2
paddle/fluid/framework/var_type_traits_test.cc
paddle/fluid/framework/var_type_traits_test.cc
+0
-2
paddle/fluid/operators/fused/conv_fusion_op.cu
paddle/fluid/operators/fused/conv_fusion_op.cu
+6
-6
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+3
-3
paddle/phi/kernels/gpudnn/conv_cudnn_v7.h
paddle/phi/kernels/gpudnn/conv_cudnn_v7.h
+51
-90
paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
+51
-20
paddle/phi/kernels/gpudnn/conv_gpudnn_info.h
paddle/phi/kernels/gpudnn/conv_gpudnn_info.h
+3
-5
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
+58
-68
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+70
-78
paddle/phi/kernels/gpudnn/conv_kernel.cu
paddle/phi/kernels/gpudnn/conv_kernel.cu
+45
-46
paddle/phi/kernels/gpudnn/conv_miopen_helper.h
paddle/phi/kernels/gpudnn/conv_miopen_helper.h
+9
-46
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
+99
-115
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
+17
-19
paddle/phi/kernels/impl/conv_cudnn_impl.h
paddle/phi/kernels/impl/conv_cudnn_impl.h
+2
-2
未找到文件。
paddle/fluid/framework/var_type_traits.cc
浏览文件 @
31f57f29
...
@@ -26,15 +26,14 @@
...
@@ -26,15 +26,14 @@
#endif
#endif
#include <cudnn.h>
#include <cudnn.h>
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
#endif
#endif
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_RCCL)
#if defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT
#endif
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
#endif
...
...
paddle/fluid/framework/var_type_traits_test.cc
浏览文件 @
31f57f29
...
@@ -26,7 +26,6 @@
...
@@ -26,7 +26,6 @@
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
#endif
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
...
@@ -34,7 +33,6 @@
...
@@ -34,7 +33,6 @@
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT
#endif
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
#if defined(PADDLE_WITH_XPU_BKCL)
...
...
paddle/fluid/operators/fused/conv_fusion_op.cu
浏览文件 @
31f57f29
...
@@ -16,10 +16,10 @@ limitations under the License. */
...
@@ -16,10 +16,10 @@ limitations under the License. */
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
DECLARE_int64
(
cudnn_exhaustive_search_times
);
DECLARE_int64
(
cudnn_exhaustive_search_times
);
...
@@ -216,7 +216,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -216,7 +216,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
cudnn_output_desc
,
output_data
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
phi
::
kNUM_CUDNN_FWD_ALGS
,
&
find_count
,
&
find_count
,
&
find_result
,
&
find_result
,
cudnn_workspace_ptr
,
cudnn_workspace_ptr
,
...
@@ -337,7 +337,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -337,7 +337,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
int
best_algo_idx
=
0
;
int
best_algo_idx
=
0
;
size_t
tmp_size
=
0
;
size_t
tmp_size
=
0
;
std
::
unique_ptr
<
cudnnConvolutionFwdAlgoPerf_t
[]
>
perf_results
(
std
::
unique_ptr
<
cudnnConvolutionFwdAlgoPerf_t
[]
>
perf_results
(
new
cudnnConvolutionFwdAlgoPerf_t
[
kNUM_CUDNN_FWD_ALGS
]);
new
cudnnConvolutionFwdAlgoPerf_t
[
phi
::
kNUM_CUDNN_FWD_ALGS
]);
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
handle
,
handle
,
...
@@ -345,7 +345,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -345,7 +345,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_filter_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
cudnn_output_desc
,
kNUM_CUDNN_FWD_ALGS
,
phi
::
kNUM_CUDNN_FWD_ALGS
,
&
perf_count
,
&
perf_count
,
perf_results
.
get
()));
perf_results
.
get
()));
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
...
@@ -378,7 +378,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -378,7 +378,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
[
&
]()
->
SearchFuseResult
<
cudnnConvolutionFwdAlgo_t
>
{
[
&
]()
->
SearchFuseResult
<
cudnnConvolutionFwdAlgo_t
>
{
int
returned_algo_count
;
int
returned_algo_count
;
SearchFuseResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
SearchFuseResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
phi
::
kNUM_CUDNN_FWD_ALGS
>
fwd_perf_stat
;
fwd_perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace
)
{
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
...
@@ -391,7 +391,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -391,7 +391,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
cudnn_output_desc
,
output_data
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
phi
::
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
&
returned_algo_count
,
fwd_perf_stat
.
data
(),
fwd_perf_stat
.
data
(),
cudnn_workspace
,
cudnn_workspace
,
...
...
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
浏览文件 @
31f57f29
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -206,7 +206,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
...
@@ -206,7 +206,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
int
best_algo_idx
=
0
;
int
best_algo_idx
=
0
;
size_t
tmp_size
=
0
;
size_t
tmp_size
=
0
;
std
::
unique_ptr
<
cudnnConvolutionFwdAlgoPerf_t
[]
>
perf_results
(
std
::
unique_ptr
<
cudnnConvolutionFwdAlgoPerf_t
[]
>
perf_results
(
new
cudnnConvolutionFwdAlgoPerf_t
[
kNUM_CUDNN_FWD_ALGS
]);
new
cudnnConvolutionFwdAlgoPerf_t
[
phi
::
kNUM_CUDNN_FWD_ALGS
]);
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
handle
,
handle
,
...
@@ -214,7 +214,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
...
@@ -214,7 +214,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
filter_desc
[
i
],
filter_desc
[
i
],
conv_desc
[
i
],
conv_desc
[
i
],
out_desc
[
i
],
out_desc
[
i
],
kNUM_CUDNN_FWD_ALGS
,
phi
::
kNUM_CUDNN_FWD_ALGS
,
&
perf_count
,
&
perf_count
,
perf_results
.
get
()));
perf_results
.
get
()));
algo
[
i
]
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
algo
[
i
]
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
...
...
paddle/
fluid/operators/conv_cudnn_helper
.h
→
paddle/
phi/kernels/gpudnn/conv_cudnn_v7
.h
浏览文件 @
31f57f29
...
@@ -14,52 +14,15 @@ limitations under the License. */
...
@@ -14,52 +14,15 @@ limitations under the License. */
#pragma once
#pragma once
#include "paddle/fluid/operators/conv_base_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace
paddle
{
namespace
phi
{
namespace
operators
{
using
ConvArgs
=
ConvArgsBase
<
cudnnHandle_t
,
cudnnDataType_t
>
;
using
ConvArgs
=
ConvArgsBase
<
cudnnHandle_t
,
cudnnDataType_t
>
;
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
static
void
RemovePaddingSlice
(
const
phi
::
GPUContext
&
context
,
const
phi
::
DenseTensor
*
input
,
phi
::
DenseTensor
*
out
,
const
std
::
vector
<
int
>&
starts
,
const
std
::
vector
<
int
>&
axes
)
{
auto
&
place
=
*
context
.
eigen_device
();
auto
in_dims
=
input
->
dims
();
auto
new_out_dims
=
out
->
dims
();
auto
offsets
=
Eigen
::
DSizes
<
Eigen
::
DenseIndex
,
D
>
();
auto
extents
=
Eigen
::
DSizes
<
Eigen
::
DenseIndex
,
D
>
();
for
(
size_t
i
=
0
;
i
<
D
;
++
i
)
{
offsets
[
i
]
=
0
;
extents
[
i
]
=
new_out_dims
[
i
];
}
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
int
start
=
starts
[
i
];
if
(
start
<
0
)
{
start
=
(
start
+
in_dims
[
axes
[
i
]]);
}
start
=
std
::
max
(
start
,
0
);
offsets
[
axes
[
i
]]
=
start
;
}
auto
in_t
=
phi
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
input
);
auto
out_t
=
phi
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
out
,
new_out_dims
);
phi
::
funcs
::
EigenSlice
<
std
::
decay_t
<
decltype
(
place
)
>
,
T
,
D
>::
Eval
(
place
,
out_t
,
in_t
,
offsets
,
extents
);
}
static
inline
double
ToMegaBytes
(
size_t
bytes
)
{
static
inline
double
ToMegaBytes
(
size_t
bytes
)
{
return
static_cast
<
double
>
(
bytes
)
/
(
1
<<
20
);
return
static_cast
<
double
>
(
bytes
)
/
(
1
<<
20
);
}
}
...
@@ -70,12 +33,12 @@ static inline bool UseFixedWorkspace() {
...
@@ -70,12 +33,12 @@ static inline bool UseFixedWorkspace() {
static
size_t
CalcWorkspaceLimitInBytes
(
bool
use_fixed_workspace
)
{
static
size_t
CalcWorkspaceLimitInBytes
(
bool
use_fixed_workspace
)
{
if
(
!
use_fixed_workspace
)
{
if
(
!
use_fixed_workspace
)
{
int
device_id
=
p
latform
::
GetCurrentDeviceId
();
int
device_id
=
p
hi
::
backends
::
gpu
::
GetCurrentDeviceId
();
int64_t
allocated
=
int64_t
allocated
=
memory
::
DeviceMemoryStatCurrentValue
(
"Allocated"
,
device_id
);
paddle
::
memory
::
DeviceMemoryStatCurrentValue
(
"Allocated"
,
device_id
);
int64_t
reserved
=
int64_t
reserved
=
memory
::
DeviceMemoryStatCurrentValue
(
"Reserved"
,
device_id
);
paddle
::
memory
::
DeviceMemoryStatCurrentValue
(
"Reserved"
,
device_id
);
int64_t
availble
=
platform
::
GpuAvailableMemToAlloc
();
int64_t
availble
=
p
addle
::
p
latform
::
GpuAvailableMemToAlloc
();
VLOG
(
3
)
<<
"[memory] allocated="
<<
ToMegaBytes
(
allocated
)
VLOG
(
3
)
<<
"[memory] allocated="
<<
ToMegaBytes
(
allocated
)
<<
" MB, reserved="
<<
ToMegaBytes
(
reserved
)
<<
" MB, reserved="
<<
ToMegaBytes
(
reserved
)
<<
" MB, available_to_alloc="
<<
ToMegaBytes
(
availble
)
<<
" MB."
;
<<
" MB, available_to_alloc="
<<
ToMegaBytes
(
availble
)
<<
" MB."
;
...
@@ -164,14 +127,13 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
...
@@ -164,14 +127,13 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
cudnnConvolutionFwdAlgo_t
algo
)
{
cudnnConvolutionFwdAlgo_t
algo
)
{
size_t
workspace_size
=
0
;
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
phi
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
algo
,
algo
,
&
workspace_size
));
&
workspace_size
));
return
workspace_size
;
return
workspace_size
;
}
}
...
@@ -193,7 +155,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
...
@@ -193,7 +155,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
int
best_algo_idx
=
0
;
int
best_algo_idx
=
0
;
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_FWD_ALGS
);
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_FWD_ALGS
);
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
p
hi
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
...
@@ -220,7 +182,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
...
@@ -220,7 +182,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
...
@@ -233,7 +195,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
...
@@ -233,7 +195,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
}
}
#else
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
...
@@ -261,7 +223,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
...
@@ -261,7 +223,7 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_FWD_ALGS
);
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_FWD_ALGS
);
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
p
hi
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
args
.
x
->
data
<
T
>
(),
...
@@ -299,15 +261,14 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
...
@@ -299,15 +261,14 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
size_t
max_workspace_size
=
0
;
size_t
max_workspace_size
=
0
;
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_FWD_ALGS
;
++
algo
)
{
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_FWD_ALGS
;
++
algo
)
{
size_t
workspace_size
=
0
;
size_t
workspace_size
=
0
;
auto
status
=
auto
status
=
phi
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
algo
),
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
algo
),
&
workspace_size
);
&
workspace_size
);
if
(
status
==
CUDNN_STATUS_SUCCESS
&&
if
(
status
==
CUDNN_STATUS_SUCCESS
&&
workspace_size
<=
workspace_size_limit
)
{
workspace_size
<=
workspace_size_limit
)
{
max_workspace_size
=
std
::
max
(
workspace_size
,
max_workspace_size
);
max_workspace_size
=
std
::
max
(
workspace_size
,
max_workspace_size
);
...
@@ -339,7 +300,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
...
@@ -339,7 +300,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
cudnnConvolutionBwdDataAlgo_t
algo
)
{
cudnnConvolutionBwdDataAlgo_t
algo
)
{
size_t
workspace_size
=
0
;
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
args
.
handle
,
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -369,7 +330,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
...
@@ -369,7 +330,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
int
best_algo_idx
=
0
;
int
best_algo_idx
=
0
;
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_DATA_ALGS
);
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_DATA_ALGS
);
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm_v7
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm_v7
(
args
.
handle
,
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -404,7 +365,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
...
@@ -404,7 +365,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -417,7 +378,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
...
@@ -417,7 +378,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
}
}
#else
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -445,7 +406,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
...
@@ -445,7 +406,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_DATA_ALGS
);
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_DATA_ALGS
);
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
p
hi
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
args
.
handle
,
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
w
->
data
<
T
>
(),
args
.
w
->
data
<
T
>
(),
...
@@ -484,7 +445,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
...
@@ -484,7 +445,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_BWD_DATA_ALGS
;
++
algo
)
{
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_BWD_DATA_ALGS
;
++
algo
)
{
size_t
workspace_size
=
0
;
size_t
workspace_size
=
0
;
auto
status
=
auto
status
=
p
latform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
args
.
handle
,
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -519,10 +480,10 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -519,10 +480,10 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
cudnnConvolutionBwdFilterAlgo_t
algo
)
{
cudnnConvolutionBwdFilterAlgo_t
algo
)
{
platform
::
CUDAGraphCaptureModeGuard
guard
;
p
addle
::
p
latform
::
CUDAGraphCaptureModeGuard
guard
;
size_t
workspace_size
=
0
;
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -552,7 +513,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -552,7 +513,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
int
best_algo_idx
=
0
;
int
best_algo_idx
=
0
;
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_FILTER_ALGS
);
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_FILTER_ALGS
);
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm_v7
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm_v7
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -575,7 +536,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -575,7 +536,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -588,7 +549,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -588,7 +549,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
}
}
#else
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -612,7 +573,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -612,7 +573,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
size_t
workspace_size_limit
=
size_t
workspace_size_limit
=
CalcWorkspaceLimitInBytes
(
UseFixedWorkspace
());
CalcWorkspaceLimitInBytes
(
UseFixedWorkspace
());
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
if
(
platform
::
CudnnDataType
<
T
>::
type
!=
CUDNN_DATA_HALF
)
{
if
(
p
addle
::
p
latform
::
CudnnDataType
<
T
>::
type
!=
CUDNN_DATA_HALF
)
{
size_t
max_workspace_size
=
size_t
max_workspace_size
=
GetMaxWorkspaceSize
(
args
,
workspace_size_limit
);
GetMaxWorkspaceSize
(
args
,
workspace_size_limit
);
VLOG
(
3
)
<<
"max_workspace_size="
<<
ToMegaBytes
(
max_workspace_size
)
VLOG
(
3
)
<<
"max_workspace_size="
<<
ToMegaBytes
(
max_workspace_size
)
...
@@ -620,7 +581,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -620,7 +581,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
p
hi
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
args
.
x
->
data
<
T
>
(),
...
@@ -649,7 +610,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -649,7 +610,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
int
max_algos
=
GetAlgorithmMaxCount
(
args
.
handle
);
int
max_algos
=
GetAlgorithmMaxCount
(
args
.
handle
);
std
::
vector
<
PerfT
>
perf_results
(
max_algos
);
std
::
vector
<
PerfT
>
perf_results
(
max_algos
);
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithm
(
p
hi
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -676,7 +637,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -676,7 +637,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
#if CUDNN_VERSION_MIN(7, 0, 1)
#if CUDNN_VERSION_MIN(7, 0, 1)
int
max_algos
=
0
;
int
max_algos
=
0
;
auto
status
=
auto
status
=
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithmMaxCount
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithmMaxCount
(
handle
,
&
max_algos
);
handle
,
&
max_algos
);
if
(
status
==
gpuSuccess
)
{
if
(
status
==
gpuSuccess
)
{
VLOG
(
5
)
<<
"[BackwardFilter] max_algos: predefined="
VLOG
(
5
)
<<
"[BackwardFilter] max_algos: predefined="
...
@@ -694,7 +655,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -694,7 +655,7 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_BWD_FILTER_ALGS
;
++
algo
)
{
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_BWD_FILTER_ALGS
;
++
algo
)
{
size_t
workspace_size
=
0
;
size_t
workspace_size
=
0
;
auto
status
=
auto
status
=
p
latform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
p
hi
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -762,7 +723,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
...
@@ -762,7 +723,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
bool
enable_autotune
=
true
)
{
bool
enable_autotune
=
true
)
{
SearchResult
<
AlgoT
>
result
;
SearchResult
<
AlgoT
>
result
;
bool
use_autotune
=
false
;
bool
use_autotune
=
false
;
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
auto
dtype
=
p
addle
::
p
latform
::
CudnnDataType
<
T
>::
type
;
SetConvMathType
(
ctx
,
dtype
,
args
.
cdesc
);
SetConvMathType
(
ctx
,
dtype
,
args
.
cdesc
);
if
(
deterministic
)
{
if
(
deterministic
)
{
...
@@ -819,12 +780,13 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
...
@@ -819,12 +780,13 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
return
result
;
return
result
;
}
}
static
void
SetConvMathType
(
const
phi
::
GPUContext
&
ctx
,
static
void
SetConvMathType
(
cudnnDataType_t
dtype
,
const
phi
::
GPUContext
&
ctx
,
const
platform
::
ConvolutionDescriptor
&
cdesc
)
{
cudnnDataType_t
dtype
,
const
paddle
::
platform
::
ConvolutionDescriptor
&
cdesc
)
{
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
if
(
ctx
.
GetComputeCapability
()
>=
70
&&
dtype
==
CUDNN_DATA_HALF
)
{
if
(
ctx
.
GetComputeCapability
()
>=
70
&&
dtype
==
CUDNN_DATA_HALF
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnSetConvolutionMathType
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
hi
::
dynload
::
cudnnSetConvolutionMathType
(
cdesc
.
desc
(),
CUDNN_TENSOR_OP_MATH
));
cdesc
.
desc
(),
CUDNN_TENSOR_OP_MATH
));
VLOG
(
5
)
<<
"Enable Tensor Core for FLOAT16"
;
VLOG
(
5
)
<<
"Enable Tensor Core for FLOAT16"
;
#if CUDA_VERSION >= 11000
#if CUDA_VERSION >= 11000
...
@@ -832,21 +794,20 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
...
@@ -832,21 +794,20 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
}
else
if
(
ctx
.
GetComputeCapability
()
>=
80
&&
}
else
if
(
ctx
.
GetComputeCapability
()
>=
80
&&
dtype
==
CUDNN_DATA_BFLOAT16
)
{
dtype
==
CUDNN_DATA_BFLOAT16
)
{
VLOG
(
5
)
<<
"Enable Tensor Core for BFLOAT16"
;
VLOG
(
5
)
<<
"Enable Tensor Core for BFLOAT16"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnSetConvolutionMathType
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
hi
::
dynload
::
cudnnSetConvolutionMathType
(
cdesc
.
desc
(),
CUDNN_TENSOR_OP_MATH
));
cdesc
.
desc
(),
CUDNN_TENSOR_OP_MATH
));
#endif // CUDNN_VERSION_MIN(8, 1, 0)
#endif // CUDNN_VERSION_MIN(8, 1, 0)
}
else
if
(
dtype
==
CUDNN_DATA_FLOAT
&&
!
cdesc
.
allow_tf32_
)
{
}
else
if
(
dtype
==
CUDNN_DATA_FLOAT
&&
!
cdesc
.
allow_tf32_
)
{
VLOG
(
5
)
<<
"Disable TensorFloat (Tensor Core) for FLOAT"
;
VLOG
(
5
)
<<
"Disable TensorFloat (Tensor Core) for FLOAT"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnSetConvolutionMathType
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
hi
::
dynload
::
cudnnSetConvolutionMathType
(
cdesc
.
desc
(),
CUDNN_FMA_MATH
));
cdesc
.
desc
(),
CUDNN_FMA_MATH
));
#endif // CUDA_VERSION >= 11000
#endif // CUDA_VERSION >= 11000
}
else
{
}
else
{
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
cudnnSetConvolutionMathType
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
hi
::
dynload
::
cudnnSetConvolutionMathType
(
cdesc
.
desc
(),
CUDNN_DEFAULT_MATH
));
cdesc
.
desc
(),
CUDNN_DEFAULT_MATH
));
}
}
#endif
#endif
}
}
};
};
}
// namespace operators
}
// namespace phi
}
// namespace paddle
paddle/
fluid/operators/conv_base_helper
.h
→
paddle/
phi/kernels/gpudnn/conv_gpudnn_base
.h
浏览文件 @
31f57f29
...
@@ -20,21 +20,19 @@ limitations under the License. */
...
@@ -20,21 +20,19 @@ limitations under the License. */
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
namespace
paddle
{
namespace
phi
{
namespace
operators
{
using
Tensor
=
phi
::
DenseTensor
;
using
GPUDNNDataLayout
=
paddle
::
platform
::
DataLayout
;
using
DataLayout
=
platform
::
DataLayout
;
using
framework
::
AlgorithmsCache
;
using
framework
::
ConvSearchCache
;
template
<
typename
T
>
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
using
ScalingParamType
=
typename
paddle
::
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
// As the container of searchAlgorithm::Find() result.
// As the container of searchAlgorithm::Find() result.
template
<
typename
AlgoT
>
template
<
typename
AlgoT
>
...
@@ -71,9 +69,9 @@ static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
...
@@ -71,9 +69,9 @@ static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
template
<
typename
HandleT
,
typename
DataT
>
template
<
typename
HandleT
,
typename
DataT
>
struct
ConvArgsBase
{
struct
ConvArgsBase
{
HandleT
handle
;
HandleT
handle
;
platform
::
TensorDescriptor
idesc
,
odesc
;
p
addle
::
p
latform
::
TensorDescriptor
idesc
,
odesc
;
platform
::
FilterDescriptor
wdesc
;
p
addle
::
p
latform
::
FilterDescriptor
wdesc
;
platform
::
ConvolutionDescriptor
cdesc
;
p
addle
::
p
latform
::
ConvolutionDescriptor
cdesc
;
const
phi
::
DenseTensor
*
x
,
*
w
,
*
o
;
const
phi
::
DenseTensor
*
x
,
*
w
,
*
o
;
DataT
cudnn_dtype
;
DataT
cudnn_dtype
;
...
@@ -88,7 +86,7 @@ struct ConvArgsBase {
...
@@ -88,7 +86,7 @@ struct ConvArgsBase {
int
group
;
int
group
;
// data format
// data format
DataLayout
data_layout
;
GPUDNN
DataLayout
data_layout
;
ConvArgsBase
(
const
phi
::
DenseTensor
*
x
,
ConvArgsBase
(
const
phi
::
DenseTensor
*
x
,
const
phi
::
DenseTensor
*
w
,
const
phi
::
DenseTensor
*
w
,
...
@@ -98,7 +96,7 @@ struct ConvArgsBase {
...
@@ -98,7 +96,7 @@ struct ConvArgsBase {
const
std
::
vector
<
int
>
d
,
const
std
::
vector
<
int
>
d
,
DataT
dtype
,
DataT
dtype
,
int
g
,
int
g
,
DataLayout
layout
)
GPUDNN
DataLayout
layout
)
:
x
(
x
),
:
x
(
x
),
w
(
w
),
w
(
w
),
o
(
o
),
o
(
o
),
...
@@ -131,16 +129,16 @@ struct ConvArgsBase {
...
@@ -131,16 +129,16 @@ struct ConvArgsBase {
}
}
};
};
static
inline
void
GetNCDHW
(
const
framework
::
DDim
&
dims
,
static
inline
void
GetNCDHW
(
const
phi
::
DDim
&
dims
,
const
DataLayout
&
layout
,
const
GPUDNN
DataLayout
&
layout
,
int
*
N
,
int
*
N
,
int
*
C
,
int
*
C
,
int
*
D
,
int
*
D
,
int
*
H
,
int
*
H
,
int
*
W
)
{
int
*
W
)
{
*
N
=
dims
[
0
];
*
N
=
dims
[
0
];
*
C
=
layout
==
DataLayout
::
kNCHW
?
dims
[
1
]
:
dims
[
dims
.
size
()
-
1
];
*
C
=
layout
==
GPUDNN
DataLayout
::
kNCHW
?
dims
[
1
]
:
dims
[
dims
.
size
()
-
1
];
int
i
=
layout
==
DataLayout
::
kNCHW
?
0
:
1
;
int
i
=
layout
==
GPUDNN
DataLayout
::
kNCHW
?
0
:
1
;
if
(
dims
.
size
()
==
5
)
{
if
(
dims
.
size
()
==
5
)
{
*
D
=
dims
[
2
-
i
];
*
D
=
dims
[
2
-
i
];
*
H
=
dims
[
3
-
i
];
*
H
=
dims
[
3
-
i
];
...
@@ -152,5 +150,38 @@ static inline void GetNCDHW(const framework::DDim& dims,
...
@@ -152,5 +150,38 @@ static inline void GetNCDHW(const framework::DDim& dims,
}
}
}
}
}
// namespace operators
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
}
// namespace paddle
static
void
RemovePaddingSlice
(
const
phi
::
GPUContext
&
context
,
const
phi
::
DenseTensor
*
input
,
phi
::
DenseTensor
*
out
,
const
std
::
vector
<
int
>&
starts
,
const
std
::
vector
<
int
>&
axes
)
{
auto
&
place
=
*
context
.
eigen_device
();
auto
in_dims
=
input
->
dims
();
auto
new_out_dims
=
out
->
dims
();
auto
offsets
=
Eigen
::
DSizes
<
Eigen
::
DenseIndex
,
D
>
();
auto
extents
=
Eigen
::
DSizes
<
Eigen
::
DenseIndex
,
D
>
();
for
(
size_t
i
=
0
;
i
<
D
;
++
i
)
{
offsets
[
i
]
=
0
;
extents
[
i
]
=
new_out_dims
[
i
];
}
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
int
start
=
starts
[
i
];
if
(
start
<
0
)
{
start
=
(
start
+
in_dims
[
axes
[
i
]]);
}
start
=
std
::
max
(
start
,
0
);
offsets
[
axes
[
i
]]
=
start
;
}
auto
in_t
=
phi
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
input
);
auto
out_t
=
phi
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
out
,
new_out_dims
);
phi
::
funcs
::
EigenSlice
<
std
::
decay_t
<
decltype
(
place
)
>
,
T
,
D
>::
Eval
(
place
,
out_t
,
in_t
,
offsets
,
extents
);
}
}
// namespace phi
paddle/
fluid/operators/conv_cudnn_op_cache
.h
→
paddle/
phi/kernels/gpudnn/conv_gpudnn_info
.h
浏览文件 @
31f57f29
...
@@ -18,15 +18,14 @@ limitations under the License. */
...
@@ -18,15 +18,14 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_map>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
DECLARE_int64
(
conv_workspace_size_limit
);
DECLARE_int64
(
conv_workspace_size_limit
);
DECLARE_bool
(
cudnn_exhaustive_search
);
DECLARE_bool
(
cudnn_exhaustive_search
);
DECLARE_int64
(
cudnn_exhaustive_search_times
);
DECLARE_int64
(
cudnn_exhaustive_search_times
);
namespace
p
addle
{
namespace
p
hi
{
namespace
operators
{
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
static
constexpr
size_t
kNUM_CUDNN_FWD_ALGS
=
1
;
static
constexpr
size_t
kNUM_CUDNN_FWD_ALGS
=
1
;
static
constexpr
size_t
kNUM_CUDNN_BWD_FILTER_ALGS
=
1
;
static
constexpr
size_t
kNUM_CUDNN_BWD_FILTER_ALGS
=
1
;
...
@@ -39,5 +38,4 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
...
@@ -39,5 +38,4 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT
;
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT
;
#endif
#endif
}
// namespace operators
}
// namespace phi
}
// namespace paddle
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
浏览文件 @
31f57f29
...
@@ -19,9 +19,9 @@
...
@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
#include "paddle/
fluid/operators
/conv_miopen_helper.h"
#include "paddle/
phi/kernels/gpudnn
/conv_miopen_helper.h"
#else
#else
#include "paddle/
fluid/operators/conv_cudnn_helper
.h"
#include "paddle/
phi/kernels/gpudnn/conv_cudnn_v7
.h"
#endif
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
...
@@ -257,55 +257,53 @@ void ConvCudnnGradGradKernel(
...
@@ -257,55 +257,53 @@ void ConvCudnnGradGradKernel(
auto
layout
=
paddle
::
platform
::
GetCudnnTensorFormat
(
auto
layout
=
paddle
::
platform
::
GetCudnnTensorFormat
(
paddle
::
platform
::
DataLayout
::
kNCHW
);
paddle
::
platform
::
DataLayout
::
kNCHW
);
paddle
::
operators
::
ConvArgs
args1
{
&
transformed_ddX
,
ConvArgs
args1
{
&
transformed_ddX
,
W
,
W
,
&
transformed_ddO_channel
,
&
transformed_ddO_channel
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations
,
dilations
,
dtype
,
dtype
,
groups
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
paddle
::
platform
::
DataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args2
{
&
transformed_X
,
ConvArgs
args2
{
&
transformed_X
,
ddW
,
ddW
,
&
transformed_ddO_channel
,
&
transformed_ddO_channel
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations
,
dilations
,
dtype
,
dtype
,
groups
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
paddle
::
platform
::
DataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args3
{
&
transformed_ddX
,
ConvArgs
args3
{
&
transformed_ddX
,
dW
,
dW
,
&
transformed_dO_channel
,
&
transformed_dO_channel
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations
,
dilations
,
dtype
,
dtype
,
groups
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
paddle
::
platform
::
DataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args4
{
&
transformed_dX
,
ConvArgs
args4
{
&
transformed_dX
,
ddW
,
ddW
,
&
transformed_dO_channel
,
&
transformed_dO_channel
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations
,
dilations
,
dtype
,
dtype
,
groups
,
groups
,
paddle
::
platform
::
DataLayout
::
kNCHW
};
paddle
::
platform
::
DataLayout
::
kNCHW
};
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result1
;
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result1
;
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result2
;
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result2
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
data_result
;
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
data_result
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
filter_result
;
#else
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result1
;
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result1
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result2
;
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result2
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
data_result
;
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
data_result
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
filter_result
;
#endif
#endif
// ddo = conv(ddI, W) + conv(I, ddW)
// ddo = conv(ddI, W) + conv(I, ddW)
...
@@ -328,14 +326,12 @@ void ConvCudnnGradGradKernel(
...
@@ -328,14 +326,12 @@ void ConvCudnnGradGradKernel(
c_group
);
c_group
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search1
=
using
search1
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
);
fwd_result1
.
algo
=
search1
::
Find
<
T
>
(
fwd_result1
.
algo
=
search1
::
Find
<
T
>
(
args1
,
exhaustive_search
,
false
,
workspace_size
,
ctx
);
args1
,
exhaustive_search
,
false
,
workspace_size
,
ctx
);
#else
#else
using
search1
=
using
search1
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_result1
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
exhaustive_search
,
false
);
fwd_result1
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
exhaustive_search
,
false
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
fwd_result1
.
algo
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
fwd_result1
.
algo
);
#endif
#endif
...
@@ -355,15 +351,13 @@ void ConvCudnnGradGradKernel(
...
@@ -355,15 +351,13 @@ void ConvCudnnGradGradKernel(
c_group
);
c_group
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search2
=
using
search2
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
fwd_result2
.
algo
=
search2
::
Find
<
T
>
(
fwd_result2
.
algo
=
search2
::
Find
<
T
>
(
args2
,
exhaustive_search
,
false
,
workspace_size
,
ctx
);
args2
,
exhaustive_search
,
false
,
workspace_size
,
ctx
);
#else
#else
using
search2
=
using
search2
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_result2
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
exhaustive_search
,
false
);
fwd_result2
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
exhaustive_search
,
false
);
workspace_size
=
std
::
max
(
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
fwd_result2
.
algo
));
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
fwd_result2
.
algo
));
...
@@ -385,14 +379,12 @@ void ConvCudnnGradGradKernel(
...
@@ -385,14 +379,12 @@ void ConvCudnnGradGradKernel(
c_group
);
c_group
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search3
=
using
search3
=
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
));
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
));
filter_result
.
algo
=
search3
::
Find
<
T
>
(
filter_result
.
algo
=
search3
::
Find
<
T
>
(
args3
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
args3
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search3
=
using
search3
=
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_result
=
filter_result
=
search3
::
Find
<
T
>
(
ctx
,
args3
,
exhaustive_search
,
deterministic
);
search3
::
Find
<
T
>
(
ctx
,
args3
,
exhaustive_search
,
deterministic
);
workspace_size
=
std
::
max
(
workspace_size
=
std
::
max
(
...
@@ -415,14 +407,12 @@ void ConvCudnnGradGradKernel(
...
@@ -415,14 +407,12 @@ void ConvCudnnGradGradKernel(
c_group
);
c_group
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search4
=
using
search4
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
));
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
));
data_result
.
algo
=
search4
::
Find
<
T
>
(
data_result
.
algo
=
search4
::
Find
<
T
>
(
args4
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
args4
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search4
=
using
search4
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
data_result
=
data_result
=
search4
::
Find
<
T
>
(
ctx
,
args4
,
exhaustive_search
,
deterministic
);
search4
::
Find
<
T
>
(
ctx
,
args4
,
exhaustive_search
,
deterministic
);
workspace_size
=
std
::
max
(
workspace_size
=
std
::
max
(
...
@@ -447,8 +437,8 @@ void ConvCudnnGradGradKernel(
...
@@ -447,8 +437,8 @@ void ConvCudnnGradGradKernel(
int
group_offset_out
=
o_c
/
groups
*
o_h
*
o_w
*
o_d
;
int
group_offset_out
=
o_c
/
groups
*
o_h
*
o_w
*
o_d
;
int
group_offset_filter
=
W
->
numel
()
/
groups
;
int
group_offset_filter
=
W
->
numel
()
/
groups
;
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
...
@@ -657,10 +647,10 @@ void ConvCudnnGradGradKernel(
...
@@ -657,10 +647,10 @@ void ConvCudnnGradGradKernel(
axes
[
i
]
=
i
;
axes
[
i
]
=
i
;
}
}
if
(
X
->
dims
().
size
()
==
4
)
{
if
(
X
->
dims
().
size
()
==
4
)
{
paddle
::
operators
::
RemovePaddingSlice
<
Context
,
T
,
4
>
(
RemovePaddingSlice
<
Context
,
T
,
4
>
(
ctx
,
&
transformed_dX
,
&
transformed_dX_channel
,
starts
,
axes
);
ctx
,
&
transformed_dX
,
&
transformed_dX_channel
,
starts
,
axes
);
}
else
{
}
else
{
paddle
::
operators
::
RemovePaddingSlice
<
Context
,
T
,
5
>
(
RemovePaddingSlice
<
Context
,
T
,
5
>
(
ctx
,
&
transformed_dX
,
&
transformed_dX_channel
,
starts
,
axes
);
ctx
,
&
transformed_dX
,
&
transformed_dX_channel
,
starts
,
axes
);
}
}
}
}
...
...
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
浏览文件 @
31f57f29
...
@@ -19,9 +19,9 @@
...
@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
#include "paddle/
fluid/operators
/conv_miopen_helper.h"
#include "paddle/
phi/kernels/gpudnn
/conv_miopen_helper.h"
#else
#else
#include "paddle/
fluid/operators/conv_cudnn_helper
.h"
#include "paddle/
phi/kernels/gpudnn/conv_cudnn_v7
.h"
#endif
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
...
@@ -256,24 +256,24 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -256,24 +256,24 @@ void ConvCudnnGradKernel(const Context& ctx,
?
paddle
::
platform
::
DataLayout
::
kNHWC
?
paddle
::
platform
::
DataLayout
::
kNHWC
:
paddle
::
platform
::
DataLayout
::
kNCHW
;
:
paddle
::
platform
::
DataLayout
::
kNCHW
;
paddle
::
operators
::
ConvArgs
args1
{
&
transformed_input_grad
,
ConvArgs
args1
{
&
transformed_input_grad
,
&
transformed_filter_channel
,
&
transformed_filter_channel
,
&
transformed_output_grad_channel
,
&
transformed_output_grad_channel
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations
,
dilations
,
dtype
,
dtype
,
groups
,
groups
,
layout
};
layout
};
paddle
::
operators
::
ConvArgs
args2
{
&
transformed_input
,
ConvArgs
args2
{
&
transformed_input
,
&
transformed_filter_grad_channel
,
&
transformed_filter_grad_channel
,
&
transformed_output_grad_channel
,
&
transformed_output_grad_channel
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations
,
dilations
,
dtype
,
dtype
,
groups
,
groups
,
layout
};
layout
};
auto
handle
=
ctx
.
cudnn_handle
();
auto
handle
=
ctx
.
cudnn_handle
();
// TODO(phlrain): replace paddle::platform::DataLayout with phi::DataLayout
// TODO(phlrain): replace paddle::platform::DataLayout with phi::DataLayout
...
@@ -289,35 +289,35 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -289,35 +289,35 @@ void ConvCudnnGradKernel(const Context& ctx,
int
i_n
,
i_c
,
i_d
,
i_h
,
i_w
;
int
i_n
,
i_c
,
i_d
,
i_h
,
i_w
;
int
o_n
,
o_c
,
o_d
,
o_h
,
o_w
;
int
o_n
,
o_c
,
o_d
,
o_h
,
o_w
;
if
(
compute_format
==
paddle
::
platform
::
DataLayout
::
kNHWC
)
{
if
(
compute_format
==
paddle
::
platform
::
DataLayout
::
kNHWC
)
{
paddle
::
operators
::
GetNCDHW
(
transformed_input
.
dims
(),
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
i_n
,
&
i_n
,
&
i_c
,
&
i_c
,
&
i_d
,
&
i_d
,
&
i_h
,
&
i_h
,
&
i_w
);
&
i_w
);
paddle
::
operators
::
GetNCDHW
(
transformed_output_grad_channel
.
dims
(),
GetNCDHW
(
transformed_output_grad_channel
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
o_n
,
&
o_n
,
&
o_c
,
&
o_c
,
&
o_d
,
&
o_d
,
&
o_h
,
&
o_h
,
&
o_w
);
&
o_w
);
}
else
{
}
else
{
paddle
::
operators
::
GetNCDHW
(
transformed_input
.
dims
(),
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
i_n
,
&
i_n
,
&
i_c
,
&
i_c
,
&
i_d
,
&
i_d
,
&
i_h
,
&
i_h
,
&
i_w
);
&
i_w
);
paddle
::
operators
::
GetNCDHW
(
transformed_output_grad_channel
.
dims
(),
GetNCDHW
(
transformed_output_grad_channel
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
o_n
,
&
o_n
,
&
o_c
,
&
o_c
,
&
o_d
,
&
o_d
,
&
o_h
,
&
o_h
,
&
o_w
);
&
o_w
);
}
}
int
group_offset_in
=
i_c
/
groups
*
i_h
*
i_w
*
i_d
;
int
group_offset_in
=
i_c
/
groups
*
i_h
*
i_w
*
i_d
;
...
@@ -326,13 +326,11 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -326,13 +326,11 @@ void ConvCudnnGradKernel(const Context& ctx,
// ------------------- cudnn backward algorithm ---------------------
// ------------------- cudnn backward algorithm ---------------------
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result
;
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
filter_result
;
#else
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result
;
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
filter_result
;
#endif
#endif
// input data workspace_size
// input data workspace_size
size_t
workspace_size_d
=
0
;
size_t
workspace_size_d
=
0
;
...
@@ -364,15 +362,13 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -364,15 +362,13 @@ void ConvCudnnGradKernel(const Context& ctx,
c_groups
);
c_groups
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search1
=
using
search1
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size_d
=
workspace_size_d
=
std
::
max
(
workspace_size_d
,
search1
::
GetWorkspaceSize
(
args1
));
std
::
max
(
workspace_size_d
,
search1
::
GetWorkspaceSize
(
args1
));
bwd_result
.
algo
=
search1
::
Find
<
T
>
(
bwd_result
.
algo
=
search1
::
Find
<
T
>
(
args1
,
exhaustive_search
,
deterministic
,
workspace_size_d
,
ctx
);
args1
,
exhaustive_search
,
deterministic
,
workspace_size_d
,
ctx
);
#else
#else
using
search1
=
using
search1
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
bwd_result
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
exhaustive_search
,
deterministic
);
bwd_result
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
exhaustive_search
,
deterministic
);
workspace_size_d
=
std
::
max
(
workspace_size_d
,
bwd_result
.
workspace_size
);
workspace_size_d
=
std
::
max
(
workspace_size_d
,
bwd_result
.
workspace_size
);
#endif
#endif
...
@@ -392,15 +388,13 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -392,15 +388,13 @@ void ConvCudnnGradKernel(const Context& ctx,
paddle
::
platform
::
AllowTF32Cudnn
(),
paddle
::
platform
::
AllowTF32Cudnn
(),
c_groups
);
c_groups
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search2
=
using
search2
=
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size_w
=
workspace_size_w
=
std
::
max
(
workspace_size_w
,
search2
::
GetWorkspaceSize
(
args2
));
std
::
max
(
workspace_size_w
,
search2
::
GetWorkspaceSize
(
args2
));
filter_result
.
algo
=
search2
::
Find
<
T
>
(
filter_result
.
algo
=
search2
::
Find
<
T
>
(
args2
,
exhaustive_search
,
deterministic
,
workspace_size_w
,
ctx
);
args2
,
exhaustive_search
,
deterministic
,
workspace_size_w
,
ctx
);
#else
#else
using
search2
=
using
search2
=
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_result
=
filter_result
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
exhaustive_search
,
deterministic
);
search2
::
Find
<
T
>
(
ctx
,
args2
,
exhaustive_search
,
deterministic
);
VLOG
(
3
)
<<
"filter algo: "
<<
filter_result
.
algo
<<
", time "
VLOG
(
3
)
<<
"filter algo: "
<<
filter_result
.
algo
<<
", time "
...
@@ -410,12 +404,12 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -410,12 +404,12 @@ void ConvCudnnGradKernel(const Context& ctx,
}
}
// ------------------- cudnn conv backward data ---------------------
// ------------------- cudnn conv backward data ---------------------
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
// MIOPEN ONLY support beta to be 0.0f
// MIOPEN ONLY support beta to be 0.0f
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
#else
#else
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
use_addto
?
1.0
f
:
0.0
f
;
ScalingParamType
<
T
>
beta
=
use_addto
?
1.0
f
:
0.0
f
;
#endif
#endif
VLOG
(
4
)
<<
"Conv_grad: use_addto = "
<<
use_addto
;
VLOG
(
4
)
<<
"Conv_grad: use_addto = "
<<
use_addto
;
...
@@ -515,19 +509,17 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -515,19 +509,17 @@ void ConvCudnnGradKernel(const Context& ctx,
ctx
.
template
Alloc
<
T
>(
&
transformed_input_grad_channel
);
ctx
.
template
Alloc
<
T
>(
&
transformed_input_grad_channel
);
if
(
transformed_input_channel
.
dims
().
size
()
==
4
)
{
if
(
transformed_input_channel
.
dims
().
size
()
==
4
)
{
paddle
::
operators
::
RemovePaddingSlice
<
Context
,
T
,
4
>
(
RemovePaddingSlice
<
Context
,
T
,
4
>
(
ctx
,
ctx
,
&
transformed_input_grad
,
&
transformed_input_grad
,
&
transformed_input_grad_channel
,
&
transformed_input_grad_channel
,
starts
,
starts
,
axes
);
axes
);
}
else
{
}
else
{
paddle
::
operators
::
RemovePaddingSlice
<
Context
,
T
,
5
>
(
RemovePaddingSlice
<
Context
,
T
,
5
>
(
ctx
,
ctx
,
&
transformed_input_grad
,
&
transformed_input_grad
,
&
transformed_input_grad_channel
,
&
transformed_input_grad_channel
,
starts
,
starts
,
axes
);
axes
);
}
}
}
}
...
@@ -538,7 +530,7 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -538,7 +530,7 @@ void ConvCudnnGradKernel(const Context& ctx,
}
}
// filter_grad do not use inplace addto.
// filter_grad do not use inplace addto.
paddle
::
operators
::
ScalingParamType
<
T
>
beta_filter
=
0.0
f
;
ScalingParamType
<
T
>
beta_filter
=
0.0
f
;
// ------------------- cudnn conv backward filter ---------------------
// ------------------- cudnn conv backward filter ---------------------
if
(
filter_grad
)
{
if
(
filter_grad
)
{
// Because beta is zero, it is unnecessary to reset filter_grad.
// Because beta is zero, it is unnecessary to reset filter_grad.
...
...
paddle/phi/kernels/gpudnn/conv_kernel.cu
浏览文件 @
31f57f29
...
@@ -19,9 +19,9 @@
...
@@ -19,9 +19,9 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
#include "paddle/
fluid/operators
/conv_miopen_helper.h"
#include "paddle/
phi/kernels/gpudnn
/conv_miopen_helper.h"
#else
#else
#include "paddle/
fluid/operators/conv_cudnn_helper
.h"
#include "paddle/
phi/kernels/gpudnn/conv_cudnn_v7
.h"
#endif
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
...
@@ -205,15 +205,15 @@ void ConvCudnnKernel(const Context& ctx,
...
@@ -205,15 +205,15 @@ void ConvCudnnKernel(const Context& ctx,
const
T
*
filter_data
=
transformed_filter_channel
.
data
<
T
>
();
const
T
*
filter_data
=
transformed_filter_channel
.
data
<
T
>
();
// ------------------- cudnn descriptors ---------------------
// ------------------- cudnn descriptors ---------------------
paddle
::
operators
::
ConvArgs
args
{
&
transformed_input
,
ConvArgs
args
{
&
transformed_input
,
&
transformed_filter_channel
,
&
transformed_filter_channel
,
&
transformed_output
,
&
transformed_output
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations
,
dilations
,
dtype
,
dtype
,
groups
,
groups
,
compute_format
};
compute_format
};
auto
handle
=
ctx
.
cudnn_handle
();
auto
handle
=
ctx
.
cudnn_handle
();
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
...
@@ -266,35 +266,35 @@ void ConvCudnnKernel(const Context& ctx,
...
@@ -266,35 +266,35 @@ void ConvCudnnKernel(const Context& ctx,
int
o_n
,
o_c
,
o_d
,
o_h
,
o_w
;
int
o_n
,
o_c
,
o_d
,
o_h
,
o_w
;
if
(
compute_format
==
paddle
::
platform
::
DataLayout
::
kNHWC
)
{
if
(
compute_format
==
paddle
::
platform
::
DataLayout
::
kNHWC
)
{
paddle
::
operators
::
GetNCDHW
(
transformed_input
.
dims
(),
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
i_n
,
&
i_n
,
&
i_c
,
&
i_c
,
&
i_d
,
&
i_d
,
&
i_h
,
&
i_h
,
&
i_w
);
&
i_w
);
paddle
::
operators
::
GetNCDHW
(
transformed_output
.
dims
(),
GetNCDHW
(
transformed_output
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNHWC
,
paddle
::
platform
::
DataLayout
::
kNHWC
,
&
o_n
,
&
o_n
,
&
o_c
,
&
o_c
,
&
o_d
,
&
o_d
,
&
o_h
,
&
o_h
,
&
o_w
);
&
o_w
);
}
else
{
}
else
{
paddle
::
operators
::
GetNCDHW
(
transformed_input
.
dims
(),
GetNCDHW
(
transformed_input
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
i_n
,
&
i_n
,
&
i_c
,
&
i_c
,
&
i_d
,
&
i_d
,
&
i_h
,
&
i_h
,
&
i_w
);
&
i_w
);
paddle
::
operators
::
GetNCDHW
(
transformed_output
.
dims
(),
GetNCDHW
(
transformed_output
.
dims
(),
paddle
::
platform
::
DataLayout
::
kNCHW
,
paddle
::
platform
::
DataLayout
::
kNCHW
,
&
o_n
,
&
o_n
,
&
o_c
,
&
o_c
,
&
o_d
,
&
o_d
,
&
o_h
,
&
o_h
,
&
o_w
);
&
o_w
);
}
}
int
group_offset_in
=
i_c
/
groups
*
i_h
*
i_w
*
i_d
;
int
group_offset_in
=
i_c
/
groups
*
i_h
*
i_w
*
i_d
;
...
@@ -304,15 +304,14 @@ void ConvCudnnKernel(const Context& ctx,
...
@@ -304,15 +304,14 @@ void ConvCudnnKernel(const Context& ctx,
size_t
workspace_size
=
0
;
// final workspace to allocate.
size_t
workspace_size
=
0
;
// final workspace to allocate.
// ------------------- cudnn conv algorithm ---------------------
// ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
using
search
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
using
search
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
search
::
GetWorkspaceSize
(
args
);
workspace_size
=
search
::
GetWorkspaceSize
(
args
);
fwd_result
.
algo
=
search
::
Find
<
T
>
(
fwd_result
.
algo
=
search
::
Find
<
T
>
(
args
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
args
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
using
search
=
using
search
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_result
=
search
::
Find
<
T
>
(
ctx
,
args
,
exhaustive_search
,
deterministic
);
fwd_result
=
search
::
Find
<
T
>
(
ctx
,
args
,
exhaustive_search
,
deterministic
);
workspace_size
=
fwd_result
.
workspace_size
;
workspace_size
=
fwd_result
.
workspace_size
;
#endif
#endif
...
@@ -328,8 +327,8 @@ void ConvCudnnKernel(const Context& ctx,
...
@@ -328,8 +327,8 @@ void ConvCudnnKernel(const Context& ctx,
#endif
#endif
// ------------------- cudnn conv forward ---------------------
// ------------------- cudnn conv forward ---------------------
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
...
...
paddle/
fluid/operators
/conv_miopen_helper.h
→
paddle/
phi/kernels/gpudnn
/conv_miopen_helper.h
浏览文件 @
31f57f29
...
@@ -14,48 +14,12 @@ limitations under the License. */
...
@@ -14,48 +14,12 @@ limitations under the License. */
#pragma once
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
#include "paddle/fluid/operators/conv_base_helper.h"
namespace
paddle
{
namespace
phi
{
namespace
operators
{
using
ConvArgs
=
ConvArgsBase
<
miopenHandle_t
,
miopenDataType_t
>
;
using
ConvArgs
=
ConvArgsBase
<
miopenHandle_t
,
miopenDataType_t
>
;
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
static
void
RemovePaddingSlice
(
const
phi
::
GPUContext
&
context
,
const
phi
::
DenseTensor
*
input
,
phi
::
DenseTensor
*
out
,
const
std
::
vector
<
int
>&
starts
,
const
std
::
vector
<
int
>&
axes
)
{
auto
&
place
=
*
context
.
eigen_device
();
auto
in_dims
=
input
->
dims
();
auto
new_out_dims
=
out
->
dims
();
auto
offsets
=
Eigen
::
array
<
int
,
D
>
();
auto
extents
=
Eigen
::
array
<
int
,
D
>
();
for
(
size_t
i
=
0
;
i
<
D
;
++
i
)
{
offsets
[
i
]
=
0
;
extents
[
i
]
=
new_out_dims
[
i
];
}
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
int
start
=
starts
[
i
];
if
(
start
<
0
)
{
start
=
(
start
+
in_dims
[
axes
[
i
]]);
}
start
=
std
::
max
(
start
,
0
);
offsets
[
axes
[
i
]]
=
start
;
}
auto
in_t
=
framework
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
input
);
auto
out_t
=
framework
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
out
,
new_out_dims
);
out_t
.
device
(
place
)
=
in_t
.
slice
(
offsets
,
extents
);
}
template
<
typename
PerfT
>
template
<
typename
PerfT
>
struct
SearchAlgorithm
{};
struct
SearchAlgorithm
{};
...
@@ -78,7 +42,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
...
@@ -78,7 +42,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
miopenConvAlgoPerf_t
find_result
;
miopenConvAlgoPerf_t
find_result
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenFindConvolutionForwardAlgorithm
(
p
hi
::
dynload
::
miopenFindConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
args
.
x
->
data
<
T
>
(),
...
@@ -104,7 +68,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
...
@@ -104,7 +68,7 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
)
{
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
)
{
size_t
workspace_size
=
0
;
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenConvolutionForwardGetWorkSpaceSize
(
p
hi
::
dynload
::
miopenConvolutionForwardGetWorkSpaceSize
(
args
.
handle
,
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
...
@@ -134,7 +98,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
...
@@ -134,7 +98,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
miopenConvAlgoPerf_t
find_result
;
miopenConvAlgoPerf_t
find_result
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenFindConvolutionBackwardDataAlgorithm
(
p
hi
::
dynload
::
miopenFindConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
o
->
data
<
T
>
(),
args
.
o
->
data
<
T
>
(),
...
@@ -160,7 +124,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
...
@@ -160,7 +124,7 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
)
{
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
)
{
size_t
workspace_size
=
0
;
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenConvolutionBackwardDataGetWorkSpaceSize
(
p
hi
::
dynload
::
miopenConvolutionBackwardDataGetWorkSpaceSize
(
args
.
handle
,
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
...
@@ -190,7 +154,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
...
@@ -190,7 +154,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
miopenConvAlgoPerf_t
find_result
;
miopenConvAlgoPerf_t
find_result
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenFindConvolutionBackwardWeightsAlgorithm
(
p
hi
::
dynload
::
miopenFindConvolutionBackwardWeightsAlgorithm
(
args
.
handle
,
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
o
->
data
<
T
>
(),
args
.
o
->
data
<
T
>
(),
...
@@ -216,7 +180,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
...
@@ -216,7 +180,7 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
)
{
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
)
{
size_t
workspace_size
=
0
;
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
p
latform
::
dynload
::
miopenConvolutionBackwardWeightsGetWorkSpaceSize
(
p
hi
::
dynload
::
miopenConvolutionBackwardWeightsGetWorkSpaceSize
(
args
.
handle
,
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
...
@@ -227,5 +191,4 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
...
@@ -227,5 +191,4 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
}
}
};
};
}
// namespace operators
}
// namespace phi
}
// namespace paddle
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
浏览文件 @
31f57f29
...
@@ -28,11 +28,11 @@ limitations under the License. */
...
@@ -28,11 +28,11 @@ limitations under the License. */
#include "paddle/phi/kernels/transpose_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif
#endif
namespace
phi
{
namespace
phi
{
...
@@ -173,33 +173,31 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -173,33 +173,31 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
auto
dtype
=
paddle
::
platform
::
CudnnDataType
<
T
>::
type
;
auto
dtype
=
paddle
::
platform
::
CudnnDataType
<
T
>::
type
;
paddle
::
operators
::
ConvArgs
args1
{
&
transformed_dout
,
ConvArgs
args1
{
&
transformed_dout
,
&
filter
,
&
filter
,
&
x_transpose
,
&
x_transpose
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations_
,
dilations_
,
dtype
,
dtype
,
groups
,
groups
,
layout
};
layout
};
paddle
::
operators
::
ConvArgs
args2
{
&
transformed_dout
,
ConvArgs
args2
{
&
transformed_dout
,
&
filter
,
&
filter
,
&
x_transpose
,
&
x_transpose
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations_
,
dilations_
,
dtype
,
dtype
,
groups
,
groups
,
layout
};
layout
};
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
filter_result
;
#else
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
filter_result
;
#endif
#endif
auto
layout_tensor
=
paddle
::
platform
::
GetCudnnTensorFormat
(
layout
);
auto
layout_tensor
=
paddle
::
platform
::
GetCudnnTensorFormat
(
layout
);
...
@@ -222,14 +220,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -222,14 +220,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
paddle
::
platform
::
AllowTF32Cudnn
(),
paddle
::
platform
::
AllowTF32Cudnn
(),
c_groups
);
c_groups
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search1
=
using
search1
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
));
workspace_size
=
std
::
max
(
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
));
fwd_result
.
algo
=
fwd_result
.
algo
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
workspace_size
,
ctx
);
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search1
=
using
search1
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_result
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
false
,
deterministic
,
false
);
fwd_result
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
=
std
::
max
(
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
,
fwd_result
.
algo
));
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
,
fwd_result
.
algo
));
...
@@ -249,14 +245,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -249,14 +245,12 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
paddle
::
platform
::
AllowTF32Cudnn
(),
paddle
::
platform
::
AllowTF32Cudnn
(),
c_groups
);
c_groups
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search2
=
using
search2
=
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
filter_result
.
algo
=
filter_result
.
algo
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
workspace_size
,
ctx
);
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search2
=
using
search2
=
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_result
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
false
,
deterministic
,
false
);
filter_result
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
filter_result
.
algo
));
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
filter_result
.
algo
));
...
@@ -269,8 +263,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -269,8 +263,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
int
dout_offset
=
int
dout_offset
=
transformed_dout
.
numel
()
/
transformed_dout
.
dims
()[
0
]
/
groups
;
transformed_dout
.
numel
()
/
transformed_dout
.
dims
()[
0
]
/
groups
;
int
filter_offset
=
filter
.
numel
()
/
groups
;
int
filter_offset
=
filter
.
numel
()
/
groups
;
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
if
(
dx
)
{
if
(
dx
)
{
// Because beta is zero, it is unnecessary to reset dx.
// Because beta is zero, it is unnecessary to reset dx.
...
@@ -631,55 +625,53 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -631,55 +625,53 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
auto
handle
=
ctx
.
cudnn_handle
();
auto
handle
=
ctx
.
cudnn_handle
();
auto
layout
=
paddle
::
platform
::
GetCudnnTensorFormat
(
GPUDNNDataLayout
::
kNCHW
);
auto
layout
=
paddle
::
platform
::
GetCudnnTensorFormat
(
GPUDNNDataLayout
::
kNCHW
);
paddle
::
operators
::
ConvArgs
args1
{
&
transformed_ddout_channel
,
ConvArgs
args1
{
&
transformed_ddout_channel
,
&
filter
,
&
filter
,
&
transformed_ddx
,
&
transformed_ddx
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations_
,
dilations_
,
dtype
,
dtype
,
groups
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
GPUDNNDataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args2
{
&
transformed_ddout_channel
,
ConvArgs
args2
{
&
transformed_ddout_channel
,
&
ddfilter
,
&
ddfilter
,
&
transformed_x
,
&
transformed_x
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations_
,
dilations_
,
dtype
,
dtype
,
groups
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
GPUDNNDataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args3
{
&
transformed_dout
,
ConvArgs
args3
{
&
transformed_dout
,
dfilter
,
dfilter
,
&
transformed_ddx_channel
,
&
transformed_ddx_channel
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations_
,
dilations_
,
dtype
,
dtype
,
groups
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
GPUDNNDataLayout
::
kNCHW
};
paddle
::
operators
::
ConvArgs
args4
{
&
transformed_dout
,
ConvArgs
args4
{
&
transformed_dout
,
&
ddfilter
,
&
ddfilter
,
&
transformed_dx_channel
,
&
transformed_dx_channel
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations_
,
dilations_
,
dtype
,
dtype
,
groups
,
groups
,
GPUDNNDataLayout
::
kNCHW
};
GPUDNNDataLayout
::
kNCHW
};
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result1
;
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result1
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result2
;
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result2
;
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
filter_result
;
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
#else
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result1
;
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result1
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result2
;
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result2
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
filter_result
;
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
#endif
#endif
// ddo = conv(ddI, filter) + conv(I, ddfilter)
// ddo = conv(ddI, filter) + conv(I, ddfilter)
...
@@ -702,14 +694,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -702,14 +694,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle
::
platform
::
AllowTF32Cudnn
(),
paddle
::
platform
::
AllowTF32Cudnn
(),
c_group
);
c_group
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search1
=
using
search1
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
);
bwd_result1
.
algo
=
bwd_result1
.
algo
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
workspace_size
,
ctx
);
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search1
=
using
search1
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
bwd_result1
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
false
,
deterministic
,
false
);
bwd_result1
=
search1
::
Find
<
T
>
(
ctx
,
args1
,
false
,
deterministic
,
false
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
bwd_result1
.
algo
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
bwd_result1
.
algo
);
#endif
#endif
...
@@ -726,14 +716,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -726,14 +716,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle
::
platform
::
AllowTF32Cudnn
(),
paddle
::
platform
::
AllowTF32Cudnn
(),
c_group
);
c_group
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search2
=
using
search2
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
bwd_result2
.
algo
=
bwd_result2
.
algo
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
workspace_size
,
ctx
);
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search2
=
using
search2
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
bwd_result2
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
false
,
deterministic
,
false
);
bwd_result2
=
search2
::
Find
<
T
>
(
ctx
,
args2
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
bwd_result2
.
algo
));
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
bwd_result2
.
algo
));
...
@@ -753,14 +741,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -753,14 +741,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle
::
platform
::
AllowTF32Cudnn
(),
paddle
::
platform
::
AllowTF32Cudnn
(),
c_group
);
c_group
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search3
=
using
search3
=
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
));
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
));
filter_result
.
algo
=
filter_result
.
algo
=
search3
::
Find
<
T
>
(
args3
,
false
,
deterministic
,
workspace_size
,
ctx
);
search3
::
Find
<
T
>
(
args3
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search3
=
using
search3
=
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_result
=
search3
::
Find
<
T
>
(
ctx
,
args3
,
false
,
deterministic
,
false
);
filter_result
=
search3
::
Find
<
T
>
(
ctx
,
args3
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
,
filter_result
.
algo
));
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
,
filter_result
.
algo
));
...
@@ -781,14 +767,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -781,14 +767,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
paddle
::
platform
::
AllowTF32Cudnn
(),
paddle
::
platform
::
AllowTF32Cudnn
(),
c_group
);
c_group
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
using
search4
=
using
search4
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
));
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
));
fwd_result
.
algo
=
fwd_result
.
algo
=
search4
::
Find
<
T
>
(
args4
,
false
,
deterministic
,
workspace_size
,
ctx
);
search4
::
Find
<
T
>
(
args4
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search4
=
using
search4
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_result
=
search4
::
Find
<
T
>
(
ctx
,
args4
,
false
,
deterministic
,
false
);
fwd_result
=
search4
::
Find
<
T
>
(
ctx
,
args4
,
false
,
deterministic
,
false
);
workspace_size
=
std
::
max
(
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
,
fwd_result
.
algo
));
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
,
fwd_result
.
algo
));
...
@@ -796,22 +780,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -796,22 +780,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
}
}
int
i_n
,
i_c
,
i_d
,
i_h
,
i_w
;
int
i_n
,
i_c
,
i_d
,
i_h
,
i_w
;
paddle
::
operators
::
GetNCDHW
(
transformed_x
.
dims
(),
GetNCDHW
(
transformed_x
.
dims
(),
GPUDNNDataLayout
::
kNCHW
,
GPUDNNDataLayout
::
kNCHW
,
&
i_n
,
&
i_n
,
&
i_c
,
&
i_c
,
&
i_d
,
&
i_d
,
&
i_h
,
&
i_h
,
&
i_w
);
&
i_w
);
int
o_n
,
o_c
,
o_d
,
o_h
,
o_w
;
int
o_n
,
o_c
,
o_d
,
o_h
,
o_w
;
paddle
::
operators
::
GetNCDHW
(
transformed_dout
.
dims
(),
GetNCDHW
(
transformed_dout
.
dims
(),
GPUDNNDataLayout
::
kNCHW
,
GPUDNNDataLayout
::
kNCHW
,
&
o_n
,
&
o_n
,
&
o_c
,
&
o_c
,
&
o_d
,
&
o_d
,
&
o_h
,
&
o_h
,
&
o_w
);
&
o_w
);
int
group_offset_in
=
int
group_offset_in
=
transformed_x
.
numel
()
/
transformed_x
.
dims
()[
0
]
/
groups
;
transformed_x
.
numel
()
/
transformed_x
.
dims
()[
0
]
/
groups
;
...
@@ -819,8 +803,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -819,8 +803,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
transformed_dout
.
numel
()
/
transformed_dout
.
dims
()[
0
]
/
groups
;
transformed_dout
.
numel
()
/
transformed_dout
.
dims
()[
0
]
/
groups
;
int
group_offset_filter
=
filter
.
numel
()
/
groups
;
int
group_offset_filter
=
filter
.
numel
()
/
groups
;
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
auto
wkspace_handle
=
ctx
.
cudnn_workspace_handle
();
auto
wkspace_handle
=
ctx
.
cudnn_workspace_handle
();
...
...
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
浏览文件 @
31f57f29
...
@@ -26,11 +26,11 @@ limitations under the License. */
...
@@ -26,11 +26,11 @@ limitations under the License. */
#include "paddle/phi/kernels/transpose_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
#else
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h"
#endif
#endif
namespace
phi
{
namespace
phi
{
...
@@ -199,15 +199,15 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
...
@@ -199,15 +199,15 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
auto
dtype
=
paddle
::
platform
::
CudnnDataType
<
T
>::
type
;
auto
dtype
=
paddle
::
platform
::
CudnnDataType
<
T
>::
type
;
// ------------------- cudnn descriptors ---------------------
// ------------------- cudnn descriptors ---------------------
paddle
::
operators
::
ConvArgs
args
{
&
transformed_out
,
ConvArgs
args
{
&
transformed_out
,
&
filter
,
&
filter
,
&
transformed_x
,
&
transformed_x
,
strides
,
strides
,
padding_common
,
padding_common
,
dilations_
,
dilations_
,
dtype
,
dtype
,
groups
,
groups
,
data_layout
};
data_layout
};
args
.
handle
=
handle
;
args
.
handle
=
handle
;
args
.
idesc
.
set
(
transformed_out
,
iwo_groups
);
args
.
idesc
.
set
(
transformed_out
,
iwo_groups
);
args
.
wdesc
.
set
(
filter
,
layout_tensor
,
iwo_groups
);
args
.
wdesc
.
set
(
filter
,
layout_tensor
,
iwo_groups
);
...
@@ -220,16 +220,14 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
...
@@ -220,16 +220,14 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
c_groups
);
c_groups
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result
;
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result
;
using
search
=
using
search
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
));
workspace_size
=
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
));
bwd_result
.
algo
=
bwd_result
.
algo
=
search
::
Find
<
T
>
(
args
,
false
,
deterministic
,
workspace_size
,
ctx
);
search
::
Find
<
T
>
(
args
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result
;
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result
;
using
search
=
using
search
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
bwd_result
=
search
::
Find
<
T
>
(
ctx
,
args
,
false
,
deterministic
,
false
);
bwd_result
=
search
::
Find
<
T
>
(
ctx
,
args
,
false
,
deterministic
,
false
);
workspace_size
=
workspace_size
=
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
,
bwd_result
.
algo
));
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
,
bwd_result
.
algo
));
...
@@ -239,8 +237,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
...
@@ -239,8 +237,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
int
x_offset
=
transformed_x
.
numel
()
/
transformed_x
.
dims
()[
0
]
/
groups
;
int
x_offset
=
transformed_x
.
numel
()
/
transformed_x
.
dims
()[
0
]
/
groups
;
int
out_offset
=
transformed_out
.
numel
()
/
transformed_out
.
dims
()[
0
]
/
groups
;
int
out_offset
=
transformed_out
.
numel
()
/
transformed_out
.
dims
()[
0
]
/
groups
;
int
filter_offset
=
filter
.
numel
()
/
groups
;
int
filter_offset
=
filter
.
numel
()
/
groups
;
paddle
::
operators
::
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
;
paddle
::
operators
::
ScalingParamType
<
T
>
beta
=
0.0
f
;
ScalingParamType
<
T
>
beta
=
0.0
f
;
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
...
...
paddle/phi/kernels/impl/conv_cudnn_impl.h
浏览文件 @
31f57f29
...
@@ -19,9 +19,9 @@
...
@@ -19,9 +19,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/kernel_registry.h"
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
#include "paddle/
fluid/operators
/conv_miopen_helper.h"
#include "paddle/
phi/kernels/gpudnn
/conv_miopen_helper.h"
#else
#else
#include "paddle/
fluid/operators/conv_cudnn_helper
.h"
#include "paddle/
phi/kernels/gpudnn/conv_cudnn_v7
.h"
#endif
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录