Commit 3a5b5048 (unverified)
Authored Sep 14, 2022 by Yiqun Liu; committed via GitHub on Sep 14, 2022

Simplify the codes of conv. (#45966)

Parent: 62176f63

Showing 4 changed files with 104 additions and 167 deletions (+104 / -167)
Changed files:
  paddle/fluid/operators/conv_base_helper.h    (+0 / -4)
  paddle/fluid/operators/conv_cudnn_helper.h   (+97 / -163)
  paddle/fluid/operators/conv_miopen_helper.h  (+3 / -0)
  paddle/phi/kernels/autotune/cache.h          (+4 / -0)
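Before the per-file diffs, here is a minimal, self-contained C++ sketch of the deduplication pattern this commit applies: the three near-identical copies of the cuDNN `Find()` routine collapse into one shared implementation by splitting each searcher into a per-direction `SearchAlgorithmBase` specialization (only the backend-specific pieces) plus a single `SearchAlgorithm<PerfT>` template derived from it. Every name below (`FwdPerf`, `BwdDataPerf`, the integer "algo ids") is a hypothetical stand-in, not Paddle code.

```cpp
// Sketch of the base/derived split used by this commit (hypothetical types).
#include <cstdio>

enum class AlgorithmType : int { kConvForward = 1, kConvBackwardData = 2 };

struct FwdPerf {};      // stand-in for cudnnConvolutionFwdAlgoPerf_t
struct BwdDataPerf {};  // stand-in for cudnnConvolutionBwdDataAlgoPerf_t

// Only specializations of the base are ever used.
template <typename PerfT>
struct SearchAlgorithmBase {};

template <>
struct SearchAlgorithmBase<FwdPerf> {
  static constexpr AlgorithmType kAlgoType = AlgorithmType::kConvForward;
  static int FindAlgoDeterministic() { return 0; }  // pretend algo ids
  static int FindAlgoHeuristic() { return 1; }
};

template <>
struct SearchAlgorithmBase<BwdDataPerf> {
  static constexpr AlgorithmType kAlgoType = AlgorithmType::kConvBackwardData;
  static int FindAlgoDeterministic() { return 0; }
  static int FindAlgoHeuristic() { return 2; }
};

// The shared searcher: the control flow is written once and reused for
// every PerfT instead of being copy-pasted per convolution direction.
template <typename PerfT>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
  static int Find(bool deterministic) {
    return deterministic ? SearchAlgorithmBase<PerfT>::FindAlgoDeterministic()
                         : SearchAlgorithmBase<PerfT>::FindAlgoHeuristic();
  }
};

int main() {
  std::printf("fwd: algo=%d cache_slot=%d\n",
              SearchAlgorithm<FwdPerf>::Find(false),
              static_cast<int>(SearchAlgorithm<FwdPerf>::kAlgoType));
  std::printf("bwd-data: algo=%d cache_slot=%d\n",
              SearchAlgorithm<BwdDataPerf>::Find(true),
              static_cast<int>(SearchAlgorithm<BwdDataPerf>::kAlgoType));
  return 0;
}
```

The `kAlgoType` tag is what lets the shared code pick the right auto-tune cache, which is why the diff below also adds a generic `GetConv(AlgorithmType)` accessor to `cache.h`.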
paddle/fluid/operators/conv_base_helper.h

@@ -36,10 +36,6 @@ using framework::ConvSearchCache;
 
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
-// As the basic for SearchAlgorithm struct.
-template <typename PerfT>
-struct SearchAlgorithm {};
-
 // As the container of searchAlgorithm::Find() result.
 template <typename AlgoT>
 struct SearchResult {
paddle/fluid/operators/conv_cudnn_helper.h

@@ -146,83 +146,19 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
   }
 }
 
-static void SetConvMathType(const phi::GPUContext& ctx,
-                            cudnnDataType_t dtype,
-                            const platform::ConvolutionDescriptor& cdesc) {
-#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-  if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
-        cdesc.desc(), CUDNN_TENSOR_OP_MATH));
-    VLOG(5) << "use cudnn_tensor_op_math";
-#if CUDA_VERSION >= 11000
-#if CUDNN_VERSION_MIN(8, 1, 0)
-  } else if (ctx.GetComputeCapability() >= 80 &&
-             dtype == CUDNN_DATA_BFLOAT16) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
-        cdesc.desc(), CUDNN_TENSOR_OP_MATH));
-#endif  // CUDNN_VERSION_MIN(8, 1, 0)
-  } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
-        cdesc.desc(), CUDNN_FMA_MATH));
-#endif  // CUDA_VERSION >= 11000
-  } else {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
-        cdesc.desc(), CUDNN_DEFAULT_MATH));
-    VLOG(5) << "NOT use cudnn_tensor_op_math";
-  }
-#endif
-}
-
+template <typename PerfT>
+struct SearchAlgorithmBase {};
+
 // cuDNN convolution forward algorithm searcher, consisted of three searching
 // modes, namely: deterministic, heuristic and exhaustive_search mode.
 // As well as one workspace size acquirsition function with respect to
 // the chosen alogrithm.
 template <>
-struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
+struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
   using PerfT = cudnnConvolutionFwdAlgoPerf_t;
   using AlgoT = cudnnConvolutionFwdAlgo_t;
 
-  template <typename T>
-  static SearchResult<AlgoT> Find(const ConvArgs& args,
-                                  bool exhaustive_search,
-                                  bool deterministic,
-                                  const phi::GPUContext& ctx) {
-    SearchResult<AlgoT> result;
-    auto dtype = platform::CudnnDataType<T>::type;
-    SetConvMathType(ctx, dtype, args.cdesc);
-
-    if (deterministic) {
-      result = FindAlgoDeterministic(args);
-    } else {
-      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
-      // 2. Once turning on auto-tune, runn heuristic search(default) before
-      //    auto-tune process, run exhaustive_search during mentioned process.
-      // 3. After auto-tune process, run cached algorithm if cached, run
-      //    default mode for the rest.
-      auto key = args.Convert2ConvCacheKey<T>();
-      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
-      if (cache.Find(key)) {
-        auto t = cache.Get(key);
-        result.algo = static_cast<AlgoT>(t.algo);
-        result.workspace_size = t.workspace_size;
-      } else {
-        bool use_autotune =
-            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
-        if (exhaustive_search || use_autotune) {
-          result = FindAlgoExhaustiveSearch<T>(args, ctx);
-        } else {
-          result = FindAlgoHeuristic(args, ctx);
-        }
-        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
-                                    result.workspace_size);
-        cache.Set(key, node);
-      }
-    }
-    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
-            << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo
-            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
-    return result;
-  }
+  constexpr static phi::autotune::AlgorithmType kAlgoType =
+      phi::autotune::AlgorithmType::kConvForward;
 
   static size_t GetWorkspaceSize(const ConvArgs& args,
                                  cudnnConvolutionFwdAlgo_t algo) {
@@ -239,7 +175,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
     return workspace_size;
   }
 
- private:
+ protected:
   static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
     auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
     return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
@@ -271,6 +207,10 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
     if (result.workspace_size > workspace_size_limit) {
 #if CUDNN_VERSION >= 8000
+      VLOG(4) << GetPerfResultString<PerfT>("[Heuristic] FwdAlgo Perf result",
+                                            perf_results,
+                                            actual_perf_count,
+                                            workspace_size_limit);
       // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
       ChooseAlgoByWorkspace<PerfT, AlgoT>(
           perf_results, workspace_size_limit, &result);
@@ -387,53 +327,11 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
 // As well as one workspace size acquirsition function with
 // respect to the chosen alogrithm.
 template <>
-struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
+struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
   using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
   using AlgoT = cudnnConvolutionBwdDataAlgo_t;
 
-  template <typename T>
-  static SearchResult<AlgoT> Find(const ConvArgs& args,
-                                  bool exhaustive_search,
-                                  bool deterministic,
-                                  const phi::GPUContext& ctx) {
-    SearchResult<AlgoT> result;
-    auto dtype = platform::CudnnDataType<T>::type;
-    SetConvMathType(ctx, dtype, args.cdesc);
-
-    if (deterministic) {
-      result = FindAlgoDeterministic(args);
-    } else {
-      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
-      // 2. Once turning on auto-tune, runn heuristic search(default) before
-      //    auto-tune process, run exhaustive_search during mentioned process.
-      // 3. After auto-tune process, run cached algorithm if cached, run
-      //    default mode for the rest.
-      auto key = args.Convert2ConvCacheKey<T>();
-      auto& cache =
-          phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
-      if (cache.Find(key)) {
-        auto t = cache.Get(key);
-        result.algo = static_cast<AlgoT>(t.algo);
-        result.workspace_size = t.workspace_size;
-      } else {
-        bool use_autotune =
-            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
-        if (exhaustive_search || use_autotune) {
-          result = FindAlgoExhaustiveSearch<T>(args, ctx);
-        } else {
-          result = FindAlgoHeuristic(args, ctx);
-        }
-        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
-                                    result.workspace_size);
-        cache.Set(key, node);
-      }
-    }
-    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
-            << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo
-            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
-    return result;
-  }
+  constexpr static phi::autotune::AlgorithmType kAlgoType =
+      phi::autotune::AlgorithmType::kConvBackwardData;
 
   static size_t GetWorkspaceSize(const ConvArgs& args,
                                  cudnnConvolutionBwdDataAlgo_t algo) {
@@ -450,7 +348,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
     return workspace_size;
   }
 
- private:
+ protected:
   static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
     auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
@@ -609,54 +507,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
 // exhaustive_search mode. As well as one workspace size acquirsition function
 // with respect to the chosen alogrithm.
 template <>
-struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
+struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
   using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
   using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
 
-  template <typename T>
-  static SearchResult<AlgoT> Find(const ConvArgs& args,
-                                  bool exhaustive_search,
-                                  bool deterministic,
-                                  const phi::GPUContext& ctx) {
-    platform::CUDAGraphCaptureModeGuard guard;
-    SearchResult<AlgoT> result;
-    auto dtype = platform::CudnnDataType<T>::type;
-    SetConvMathType(ctx, dtype, args.cdesc);
-
-    if (deterministic) {
-      result = FindAlgoDeterministic(args);
-    } else {
-      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
-      // 2. Once turning on auto-tune, runn heuristic search(default) before
-      //    auto-tune process, run exhaustive_search during mentioned process.
-      // 3. After auto-tune process, run cached algorithm if cached, run
-      //    default mode for the rest.
-      auto key = args.Convert2ConvCacheKey<T>();
-      auto& cache =
-          phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
-      if (cache.Find(key)) {
-        auto t = cache.Get(key);
-        result.algo = static_cast<AlgoT>(t.algo);
-        result.workspace_size = t.workspace_size;
-      } else {
-        bool use_autotune =
-            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
-        if (exhaustive_search || use_autotune) {
-          result = FindAlgoExhaustiveSearch<T>(args, ctx);
-        } else {
-          result = FindAlgoHeuristic(args, ctx);
-        }
-        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
-                                    result.workspace_size);
-        cache.Set(key, node);
-      }
-    }
-    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
-            << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo
-            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
-    return result;
-  }
+  constexpr static phi::autotune::AlgorithmType kAlgoType =
+      phi::autotune::AlgorithmType::kConvBackwardFilter;
 
   static size_t GetWorkspaceSize(const ConvArgs& args,
                                  cudnnConvolutionBwdFilterAlgo_t algo) {
@@ -674,7 +529,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
     return workspace_size;
  }
 
- private:
+ protected:
   static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
     auto workspace_size =
         GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
@@ -891,5 +746,84 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
   }
 };
 
+template <typename PerfT>
+struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
+  using AlgoT = typename SearchAlgorithmBase<PerfT>::AlgoT;
+
+  template <typename T>
+  static SearchResult<AlgoT> Find(const ConvArgs& args,
+                                  bool exhaustive_search,
+                                  bool deterministic,
+                                  const phi::GPUContext& ctx) {
+    SearchResult<AlgoT> result;
+    auto dtype = platform::CudnnDataType<T>::type;
+    SetConvMathType(ctx, dtype, args.cdesc);
+
+    if (deterministic) {
+      result = SearchAlgorithmBase<PerfT>::FindAlgoDeterministic(args);
+    } else {
+      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
+      // 2. Once turning on auto-tune, runn heuristic search(default) before
+      //    auto-tune process, run exhaustive_search during mentioned process.
+      // 3. After auto-tune process, run cached algorithm if cached, run
+      //    default mode for the rest.
+      auto key = args.Convert2ConvCacheKey<T>();
+      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
+          SearchAlgorithmBase<PerfT>::kAlgoType);
+      if (cache.Find(key)) {
+        auto t = cache.Get(key);
+        result.algo = static_cast<AlgoT>(t.algo);
+        result.workspace_size = t.workspace_size;
+      } else {
+        bool use_autotune =
+            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
+        if (exhaustive_search || use_autotune) {
+          result =
+              SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
+                  args, ctx);
+        } else {
+          result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
+        }
+        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
+                                    result.workspace_size);
+        cache.Set(key, node);
+      }
+    }
+    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
+            << ", deterministic=" << deterministic
+            << ", choose algo=" << result.algo
+            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
+    return result;
+  }
+
+  static void SetConvMathType(const phi::GPUContext& ctx,
+                              cudnnDataType_t dtype,
+                              const platform::ConvolutionDescriptor& cdesc) {
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(
+              cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      VLOG(5) << "Enable Tensor Core for FLOAT16";
+#if CUDA_VERSION >= 11000
+#if CUDNN_VERSION_MIN(8, 1, 0)
+    } else if (ctx.GetComputeCapability() >= 80 &&
+               dtype == CUDNN_DATA_BFLOAT16) {
+      VLOG(5) << "Enable Tensor Core for BFLOAT16";
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(
+              cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+#endif  // CUDNN_VERSION_MIN(8, 1, 0)
+    } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
+      VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(
+              cdesc.desc(), CUDNN_FMA_MATH));
+#endif  // CUDA_VERSION >= 11000
+    } else {
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(
+              cdesc.desc(), CUDNN_DEFAULT_MATH));
+    }
+#endif
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
paddle/fluid/operators/conv_miopen_helper.h

@@ -55,6 +55,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
   out_t.device(place) = in_t.slice(offsets, extents);
 }
 
+template <typename PerfT>
+struct SearchAlgorithm {};
+
 template <>
 struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
   using perf_t = miopenConvAlgoPerf_t;
paddle/phi/kernels/autotune/cache.h

@@ -289,6 +289,10 @@ class AutoTuneCache {
     return auto_tune_map_[static_cast<int64_t>(algo_type)];
   }
 
+  CudnnAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) {
+    return cudnn_auto_tune_map_[static_cast<int64_t>(algo_type)];
+  }
+
   CudnnAlgorithmsCacheMap& GetConvForward() {
     return cudnn_auto_tune_map_[static_cast<int64_t>(
         AlgorithmType::kConvForward)];
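The new `GetConv(const AlgorithmType&)` accessor is what lets the shared `Find()` select the right per-direction cache through `SearchAlgorithmBase<PerfT>::kAlgoType` instead of calling one of three dedicated getters. Below is a rough, hedged sketch of that enum-indexed lookup with simplified, hypothetical types (`AutoTuneCacheSketch`, `AlgorithmsCache`, `conv_map_`); only the accessor shape mirrors the diff above.

```cpp
// Minimal sketch: one map indexed by the AlgorithmType enum replaces three
// direction-specific getters such as GetConvForward()/GetConvBackwardData().
#include <cstdint>
#include <unordered_map>

enum class AlgorithmType : int64_t {
  kConvForward = 1,
  kConvBackwardData = 2,
  kConvBackwardFilter = 3
};

struct AlgorithmsCache {
  // In the real code this holds the cached algo id and workspace size per key.
};

class AutoTuneCacheSketch {
 public:
  // Unified accessor: callers pass SearchAlgorithmBase<PerfT>::kAlgoType.
  AlgorithmsCache& GetConv(const AlgorithmType& algo_type) {
    return conv_map_[static_cast<int64_t>(algo_type)];
  }

 private:
  std::unordered_map<int64_t, AlgorithmsCache> conv_map_;
};
```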