Commit 3a5b5048 (unverified)
Authored by Yiqun Liu on Sep 14, 2022; committed via GitHub on Sep 14, 2022.
Simplify the codes of conv. (#45966)
Parent: 62176f63
Showing 4 changed files with 104 additions and 167 deletions (+104, −167):
paddle/fluid/operators/conv_base_helper.h     +0    −4
paddle/fluid/operators/conv_cudnn_helper.h    +97   −163
paddle/fluid/operators/conv_miopen_helper.h   +3    −0
paddle/phi/kernels/autotune/cache.h           +4    −0
paddle/fluid/operators/conv_base_helper.h
@@ -36,10 +36,6 @@ using framework::ConvSearchCache;

template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;

// As the basic for SearchAlgorithm struct.
template <typename PerfT>
struct SearchAlgorithm {};

// As the container of searchAlgorithm::Find() result.
template <typename AlgoT>
struct SearchResult {
  ...
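The hunk above cuts off inside SearchResult. For orientation, here is a minimal sketch of the container as it can be inferred only from its uses later in this diff (result.algo, result.workspace_size, and a three-argument (algo, time, workspace) construction); the middle field's name and the default values are assumptions, not taken from the source:

#include <cstddef>

template <typename AlgoT>
struct SearchResult {
  SearchResult() = default;
  // Mirrors the SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size)
  // construction used by FindAlgoDeterministic later in this diff.
  SearchResult(AlgoT a, float t, size_t ws)
      : algo(a), time(t), workspace_size(ws) {}

  AlgoT algo{};              // chosen algorithm
  float time{-1.0f};         // measured time, -1 when not benchmarked (assumed field name)
  size_t workspace_size{0};  // required cuDNN workspace in bytes
};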
paddle/fluid/operators/conv_cudnn_helper.h
@@ -146,83 +146,19 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
  }
}

static void SetConvMathType(const phi::GPUContext& ctx,
                            cudnnDataType_t dtype,
                            const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
  if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
        cdesc.desc(), CUDNN_TENSOR_OP_MATH));
    VLOG(5) << "use cudnn_tensor_op_math";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
  } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
        cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif  // CUDNN_VERSION_MIN(8, 1, 0)
  } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
        cdesc.desc(), CUDNN_FMA_MATH));
#endif  // CUDA_VERSION >= 11000
  } else {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
        cdesc.desc(), CUDNN_DEFAULT_MATH));
    VLOG(5) << "NOT use cudnn_tensor_op_math";
  }
#endif
}

template <typename PerfT>
struct SearchAlgorithmBase {};

// cuDNN convolution forward algorithm searcher, consisting of three search
// modes, namely: deterministic, heuristic and exhaustive_search mode.
// As well as one workspace size acquisition function with respect to
// the chosen algorithm.
 template <>
-struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
+struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
  using PerfT = cudnnConvolutionFwdAlgoPerf_t;
  using AlgoT = cudnnConvolutionFwdAlgo_t;

  template <typename T>
  static SearchResult<AlgoT> Find(const ConvArgs& args,
                                  bool exhaustive_search,
                                  bool deterministic,
                                  const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    auto dtype = platform::CudnnDataType<T>::type;
    SetConvMathType(ctx, dtype, args.cdesc);

    if (deterministic) {
      result = FindAlgoDeterministic(args);
    } else {
      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
      // 2. Once turning on auto-tune, run heuristic search (default) before the
      //    auto-tune process, and run exhaustive_search during that process.
      // 3. After the auto-tune process, run the cached algorithm if cached, and
      //    run default mode for the rest.
      auto key = args.Convert2ConvCacheKey<T>();
      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
      if (cache.Find(key)) {
        auto t = cache.Get(key);
        result.algo = static_cast<AlgoT>(t.algo);
        result.workspace_size = t.workspace_size;
      } else {
        bool use_autotune = phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          result = FindAlgoExhaustiveSearch<T>(args, ctx);
        } else {
          result = FindAlgoHeuristic(args, ctx);
        }
        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
                                    result.workspace_size);
        cache.Set(key, node);
      }
    }
    VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
    return result;
  }

  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvForward;

  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionFwdAlgo_t algo) {
  ...
@@ -239,7 +175,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
    return workspace_size;
  }

- private:
+ protected:
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
    return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
  ...
@@ -271,6 +207,10 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      VLOG(4) << GetPerfResultString<PerfT>("[Heuristic] FwdAlgo Perf result",
                                            perf_results,
                                            actual_perf_count,
                                            workspace_size_limit);
      // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
  ...
@@ -387,53 +327,11 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
// As well as one workspace size acquisition function with
// respect to the chosen algorithm.
 template <>
-struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
+struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
  using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdDataAlgo_t;

  template <typename T>
  static SearchResult<AlgoT> Find(const ConvArgs& args,
                                  bool exhaustive_search,
                                  bool deterministic,
                                  const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    auto dtype = platform::CudnnDataType<T>::type;
    SetConvMathType(ctx, dtype, args.cdesc);

    if (deterministic) {
      result = FindAlgoDeterministic(args);
    } else {
      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
      // 2. Once turning on auto-tune, run heuristic search (default) before the
      //    auto-tune process, and run exhaustive_search during that process.
      // 3. After the auto-tune process, run the cached algorithm if cached, and
      //    run default mode for the rest.
      auto key = args.Convert2ConvCacheKey<T>();
      auto& cache =
          phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
      if (cache.Find(key)) {
        auto t = cache.Get(key);
        result.algo = static_cast<AlgoT>(t.algo);
        result.workspace_size = t.workspace_size;
      } else {
        bool use_autotune = phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          result = FindAlgoExhaustiveSearch<T>(args, ctx);
        } else {
          result = FindAlgoHeuristic(args, ctx);
        }
        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
                                    result.workspace_size);
        cache.Set(key, node);
      }
    }
    VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
    return result;
  }

  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvBackwardData;

  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdDataAlgo_t algo) {
  ...
@@ -450,7 +348,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
    return workspace_size;
  }

- private:
+ protected:
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
  ...
@@ -609,54 +507,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
// exhaustive_search mode. As well as one workspace size acquisition function
// with respect to the chosen algorithm.
 template <>
-struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
+struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
  using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdFilterAlgo_t;

  template <typename T>
  static SearchResult<AlgoT> Find(const ConvArgs& args,
                                  bool exhaustive_search,
                                  bool deterministic,
                                  const phi::GPUContext& ctx) {
    platform::CUDAGraphCaptureModeGuard guard;
    SearchResult<AlgoT> result;
    auto dtype = platform::CudnnDataType<T>::type;
    SetConvMathType(ctx, dtype, args.cdesc);

    if (deterministic) {
      result = FindAlgoDeterministic(args);
    } else {
      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
      // 2. Once turning on auto-tune, run heuristic search (default) before the
      //    auto-tune process, and run exhaustive_search during that process.
      // 3. After the auto-tune process, run the cached algorithm if cached, and
      //    run default mode for the rest.
      auto key = args.Convert2ConvCacheKey<T>();
      auto& cache =
          phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
      if (cache.Find(key)) {
        auto t = cache.Get(key);
        result.algo = static_cast<AlgoT>(t.algo);
        result.workspace_size = t.workspace_size;
      } else {
        bool use_autotune = phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          result = FindAlgoExhaustiveSearch<T>(args, ctx);
        } else {
          result = FindAlgoHeuristic(args, ctx);
        }
        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
                                    result.workspace_size);
        cache.Set(key, node);
      }
    }
    VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
    return result;
  }

  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvBackwardFilter;

  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdFilterAlgo_t algo) {
  ...
@@ -674,7 +529,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
    return workspace_size;
  }

- private:
+ protected:
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
  ...
@@ -891,5 +746,84 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
  }
};

template <typename PerfT>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
  using AlgoT = typename SearchAlgorithmBase<PerfT>::AlgoT;

  template <typename T>
  static SearchResult<AlgoT> Find(const ConvArgs& args,
                                  bool exhaustive_search,
                                  bool deterministic,
                                  const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    auto dtype = platform::CudnnDataType<T>::type;
    SetConvMathType(ctx, dtype, args.cdesc);

    if (deterministic) {
      result = SearchAlgorithmBase<PerfT>::FindAlgoDeterministic(args);
    } else {
      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
      // 2. Once turning on auto-tune, run heuristic search (default) before the
      //    auto-tune process, and run exhaustive_search during that process.
      // 3. After the auto-tune process, run the cached algorithm if cached, and
      //    run default mode for the rest.
      auto key = args.Convert2ConvCacheKey<T>();
      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
          SearchAlgorithmBase<PerfT>::kAlgoType);
      if (cache.Find(key)) {
        auto t = cache.Get(key);
        result.algo = static_cast<AlgoT>(t.algo);
        result.workspace_size = t.workspace_size;
      } else {
        bool use_autotune = phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          result =
              SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
                  args, ctx);
        } else {
          result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
        }
        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
                                    result.workspace_size);
        cache.Set(key, node);
      }
    }
    VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
    return result;
  }

  static void SetConvMathType(const phi::GPUContext& ctx,
                              cudnnDataType_t dtype,
                              const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
    if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
      VLOG(5) << "Enable Tensor Core for FLOAT16";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
    } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) {
      VLOG(5) << "Enable Tensor Core for BFLOAT16";
      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif  // CUDNN_VERSION_MIN(8, 1, 0)
    } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
      VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_FMA_MATH));
#endif  // CUDA_VERSION >= 11000
    } else {
      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_DEFAULT_MATH));
    }
#endif
  }
};

}  // namespace operators
}  // namespace paddle
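To make the shape of this refactor easier to see in isolation, here is a small self-contained C++ analogy. It is not Paddle code: every name in it (ToyResult, FwdPerf, BwdDataPerf, Caches, the toy return values) is a hypothetical stand-in. It only mirrors the structure introduced above, where per-PerfT "Base" specializations carry what differs (an algorithm-type tag plus the per-mode search routines), while a single derived template supplies the shared cache-then-search driver, as SearchAlgorithm<PerfT>::Find() now does.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <unordered_map>

enum class AlgorithmType : int64_t { kConvForward = 1, kConvBackwardData = 2 };

struct ToyResult {
  int64_t algo = 0;
  size_t workspace_size = 0;
};

// One cache per algorithm type, indexed by the enum value
// (a stand-in for AutoTuneCache::GetConv(algo_type)).
std::unordered_map<int64_t, std::unordered_map<size_t, ToyResult>>& Caches() {
  static std::unordered_map<int64_t, std::unordered_map<size_t, ToyResult>> c;
  return c;
}

template <typename PerfT>
struct SearchAlgorithmBase {};

struct FwdPerf {};      // stand-in for cudnnConvolutionFwdAlgoPerf_t
struct BwdDataPerf {};  // stand-in for cudnnConvolutionBwdDataAlgoPerf_t

template <>
struct SearchAlgorithmBase<FwdPerf> {
  static constexpr AlgorithmType kAlgoType = AlgorithmType::kConvForward;
  static ToyResult FindAlgoHeuristic(size_t /*key*/) { return {1, 64}; }
};

template <>
struct SearchAlgorithmBase<BwdDataPerf> {
  static constexpr AlgorithmType kAlgoType = AlgorithmType::kConvBackwardData;
  static ToyResult FindAlgoHeuristic(size_t /*key*/) { return {2, 128}; }
};

// Shared driver: consult the per-type cache first, otherwise search and cache.
template <typename PerfT>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
  static ToyResult Find(size_t key) {
    auto& cache =
        Caches()[static_cast<int64_t>(SearchAlgorithmBase<PerfT>::kAlgoType)];
    auto it = cache.find(key);
    if (it != cache.end()) return it->second;
    ToyResult r = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(key);
    cache[key] = r;
    return r;
  }
};

int main() {
  auto fwd = SearchAlgorithm<FwdPerf>::Find(/*key=*/42);
  auto bwd = SearchAlgorithm<BwdDataPerf>::Find(/*key=*/42);
  std::cout << "fwd algo=" << fwd.algo << ", bwd algo=" << bwd.algo << "\n";
  return 0;
}

The real implementation additionally threads ConvArgs, the cuDNN data type, the deterministic/exhaustive flags, and workspace sizing through the same shared driver.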
paddle/fluid/operators/conv_miopen_helper.h
@@ -55,6 +55,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
  out_t.device(place) = in_t.slice(offsets, extents);
}

template <typename PerfT>
struct SearchAlgorithm {};

template <>
struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
  using perf_t = miopenConvAlgoPerf_t;
  ...
paddle/phi/kernels/autotune/cache.h
@@ -289,6 +289,10 @@ class AutoTuneCache {
    return auto_tune_map_[static_cast<int64_t>(algo_type)];
  }

  CudnnAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) {
    return cudnn_auto_tune_map_[static_cast<int64_t>(algo_type)];
  }

  CudnnAlgorithmsCacheMap& GetConvForward() {
    return cudnn_auto_tune_map_[static_cast<int64_t>(AlgorithmType::kConvForward)];
  ...
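The accessor added in this hunk is what lets the shared driver stay generic: SearchAlgorithm<PerfT>::Find() fetches its cache via GetConv(SearchAlgorithmBase<PerfT>::kAlgoType) instead of calling GetConvForward(), GetConvBackwardData(), or GetConvBackwardFilter() directly. A minimal sketch of that equivalence for the forward case, using only identifiers that appear in this diff; it compiles only inside the Paddle source tree, so treat it as an illustration rather than a standalone program:

#include "paddle/phi/kernels/autotune/cache.h"  // the header changed in this hunk

// Both accessors should refer to the same underlying cuDNN algorithm cache:
// GetConv() indexes cudnn_auto_tune_map_ by the AlgorithmType enum, exactly as
// GetConvForward() does with the fixed value AlgorithmType::kConvForward.
void ForwardCacheAccessorSketch() {
  auto& cache = phi::autotune::AutoTuneCache::Instance();
  auto& by_enum = cache.GetConv(phi::autotune::AlgorithmType::kConvForward);
  auto& by_getter = cache.GetConvForward();
  (void)by_enum;
  (void)by_getter;
}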