Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
8ebcf948
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8ebcf948
编写于
9月 16, 2020
作者:
Z
zhangting2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
change filter_grad algo
上级
c67c3916
变更
2
显示空白变更内容
内联
并排
Showing
2 changed files
with
108 additions
and
1 deletion
+108
-1
paddle/fluid/operators/conv_cudnn_helper.h
paddle/fluid/operators/conv_cudnn_helper.h
+105
-0
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+3
-1
未找到文件。
paddle/fluid/operators/conv_cudnn_helper.h
浏览文件 @
8ebcf948
...
@@ -17,6 +17,7 @@ limitations under the License. */
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm>
#include <algorithm>
#include <array>
#include <array>
#include <memory>
#include <memory>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
...
@@ -90,6 +91,75 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
...
@@ -90,6 +91,75 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
return
out
;
return
out
;
}
}
// Returns the value written by
// cudnnGetConvolutionBackwardFilterAlgorithmMaxCount for `cudnn_handle`, i.e.
// the upper bound used by callers to size a backward-filter perf-result buffer.
//
// When the CUDNN_VERSION_MIN(7, 0, 1) guard is false the query is compiled out
// and the function returns 0. A failing cuDNN call is reported through
// PADDLE_ENFORCE_CUDA_SUCCESS.
inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
#if CUDNN_VERSION_MIN(7, 0, 1)
  int algo_count = 0;
  PADDLE_ENFORCE_CUDA_SUCCESS(
      platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
          cudnn_handle, &algo_count));
  return algo_count;
#else
  return 0;
#endif
}
// Selects a convolution algorithm from a list of cuDNN perf results.
//
// Entries of `perf_results` are visited in order and the first acceptable one
// is written to `*algo`. An entry is acceptable when all of the following
// hold:
//   - its `status` is CUDNN_STATUS_SUCCESS;
//   - `deterministic` is false, or its `determinism` is CUDNN_DETERMINISTIC;
//   - its `memory` (workspace requirement) is <= `workspace_byte`;
//   - `algo_preference` is -1 (any algorithm) or equals its `algo`.
// The math type is not part of the filter, with one exception: a Tensor Core
// entry is skipped when the entry right after it is a successful
// non-Tensor-Core run of the same algorithm with the same workspace size and a
// time below 1.01x of the Tensor Core time. In that case the non-Tensor-Core
// entry is preferred.
//
// NOTE(review): "first acceptable" equals "fastest acceptable" only if
// `perf_results` is ordered by time -- presumably the cuDNN Find* call that
// produced it guarantees this; confirm against the cuDNN documentation.
//
// Template parameters:
//   PerfType - perf-result struct exposing `algo`, `status`, `time`,
//              `memory`, `determinism` and `mathType`.
//   AlgoType - algorithm enum type stored through `algo`.
//
// Parameters:
//   perf_results    - candidate results to choose from.
//   kernel_name     - label used only in the log output.
//   algo_preference - required algorithm id, or -1 to accept any.
//   workspace_byte  - maximum workspace size an accepted entry may need.
//   algo            - output; written only when an entry is accepted.
//   deterministic   - when true, only deterministic entries are accepted.
//
// If no entry is acceptable, `*algo` is left untouched and a log line is
// emitted; callers should initialise `*algo` to a safe default beforehand.
template <typename PerfType, typename AlgoType>
void AlgoFinalSelect(const std::vector<PerfType>& perf_results,
                     const std::string& kernel_name, int32_t algo_preference,
                     size_t workspace_byte, AlgoType* algo,
                     bool deterministic) {
  // "+" marks a Tensor Core result, "-" everything else (log output only).
  const auto tensor_core_tag = [](const PerfType& perf) {
    return perf.mathType == CUDNN_TENSOR_OP_MATH ? "+" : "-";
  };

  VLOG(3) << "=========Full results of algo=========" << kernel_name << ":";
  for (const auto& result : perf_results) {
    VLOG(3) << " algo: " << result.algo << ", TC" << tensor_core_tag(result)
            << ", time: " << result.time << " ms"
            << ", wksp = " << result.memory
            << ", status = " << result.status;
  }

  const size_t num_results = perf_results.size();
  for (size_t i = 0; i < num_results; ++i) {
    const auto& result = perf_results[i];

    const bool acceptable =
        result.status == CUDNN_STATUS_SUCCESS &&
        (!deterministic ||
         result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
        result.memory <= workspace_byte &&
        (algo_preference == -1 ||
         algo_preference == static_cast<int32_t>(result.algo));
    if (!acceptable) {
      continue;
    }

    // Look one entry ahead: the same algorithm may also have been measured
    // without Tensor Core math.
    if (result.mathType == CUDNN_TENSOR_OP_MATH && i + 1 < num_results) {
      const auto& next_result = perf_results[i + 1];
      if (next_result.status == CUDNN_STATUS_SUCCESS &&
          next_result.algo == result.algo &&
          next_result.memory == result.memory &&
          next_result.mathType != CUDNN_TENSOR_OP_MATH &&
          next_result.time < 1.01 * result.time) {
        // Skip over this result- it's not really a Tensor Core algo.
        // Prefer instead the next equivalent non-Tensor Core algo.
        continue;
      }
    }

    *algo = result.algo;
    VLOG(3) << " choose algo: " << result.algo << ", TC"
            << tensor_core_tag(result) << ", time: " << result.time << " ms"
            << ", wksp = " << result.memory
            << ", status = " << result.status;
    return;
  }

  // Nothing matched. The void interface cannot report this, so at least make
  // it visible in the log instead of returning silently.
  VLOG(3) << " no algo satisfied the constraints for " << kernel_name
          << "; output algo is left unchanged";
}
using
framework
::
ConvSearchCache
;
using
framework
::
ConvSearchCache
;
struct
ConvArgs
{
struct
ConvArgs
{
...
@@ -396,6 +466,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -396,6 +466,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
algo_t
algo
;
algo_t
algo
;
if
(
!
exhaustive
&&
!
deterministic
)
{
if
(
!
exhaustive
&&
!
deterministic
)
{
#if CUDNN_VERSION >= 7001
#if CUDNN_VERSION >= 7001
/*
using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t;
using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t;
int perf_count;
int perf_count;
int best_algo_idx = 0;
int best_algo_idx = 0;
...
@@ -411,7 +482,39 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -411,7 +482,39 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
if (workspace_size > workspace_size_limit) {
if (workspace_size > workspace_size_limit) {
workspace_size = workspace_size_limit;
workspace_size = workspace_size_limit;
}
}
auto math_type_str = "-";
if ((perf_results.get())[best_algo_idx].mathType ==
CUDNN_TENSOR_OP_MATH) {
math_type_str = "+";
}
VLOG(3) << " algo: " << (perf_results.get())[best_algo_idx].algo
<< ", TC" << math_type_str
<< ", time: " << (perf_results.get())[best_algo_idx].time << " ms"
<< ", wksp = " << (perf_results.get())[best_algo_idx].memory
<< ", status = " << (perf_results.get())[best_algo_idx].status;
*/
auto
max_bwd_filt_algos
=
MaxBackwardFilterAlgos
(
args
.
handle
);
std
::
vector
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
bwd_filt_results
(
max_bwd_filt_algos
);
int
actual_bwd_filter_algos
=
0
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
bwd_filt_results
.
size
(),
&
actual_bwd_filter_algos
,
bwd_filt_results
.
data
()));
bwd_filt_results
.
resize
(
actual_bwd_filter_algos
);
AlgoFinalSelect
<
cudnnConvolutionBwdFilterAlgoPerf_t
,
cudnnConvolutionBwdFilterAlgo_t
>
(
bwd_filt_results
,
"backprop-to-filter"
,
-
1
,
workspace_size_limit
,
&
algo
,
deterministic
);
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
if
(
workspace_size
>
workspace_size_limit
)
{
workspace_size
=
workspace_size_limit
;
}
#else
#else
VLOG
(
3
)
<<
"=======cudnnGetConvolutionBackwardFilterAlgorithm====="
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
...
@@ -420,8 +523,10 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -420,8 +523,10 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
workspace_size_limit
,
&
algo
));
workspace_size_limit
,
&
algo
));
#endif
#endif
}
else
if
(
deterministic
)
{
}
else
if
(
deterministic
)
{
VLOG
(
3
)
<<
"======choose deterministic algo======"
;
return
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
return
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
else
{
}
else
{
VLOG
(
3
)
<<
"========Get cache algo==========="
;
auto
&
dev_ctx
=
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
...
...
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
8ebcf948
...
@@ -95,6 +95,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
...
@@ -95,6 +95,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnGetVersion); \
__macro(cudnnGetVersion); \
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithm); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnGetErrorString); \
__macro(cudnnGetErrorString); \
__macro(cudnnCreateDropoutDescriptor); \
__macro(cudnnCreateDropoutDescriptor); \
...
@@ -194,7 +195,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
...
@@ -194,7 +195,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
CUDNN_DNN_ROUTINE_EACH_AFTER_R7
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
#endif
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录