机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 924aac22 (unverified)
Authored on Jan 11, 2021 by AshburnLee; committed by GitHub on Jan 11, 2021

Add tf32 switch for cuDNN (#29192)

Parent: 8ce2482b
Showing 10 changed files with 113 additions and 20 deletions
paddle/fluid/operators/conv_cudnn_helper.h                 +21  -9
paddle/fluid/operators/conv_cudnn_op.cu                    +14  -7
paddle/fluid/operators/conv_transpose_cudnn_op.cu           +6  -3
paddle/fluid/operators/fused/conv_fusion_op.cu              +7  -0
paddle/fluid/operators/fused/fusion_conv_inception_op.cu    +7  -0
paddle/fluid/platform/cudnn_desc.h                         +10  -1
paddle/fluid/platform/device_context.cc                     +4  -0
paddle/fluid/platform/device_context.h                      +4  -0
paddle/fluid/pybind/pybind.cc                               +2  -0
python/paddle/fluid/tests/unittests/test_tf32_cudnn.py    +38  -0
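Taken together, the commit threads an allow-TF32 flag through every cuDNN convolution descriptor Paddle sets up: a new global switch (on by default) lives in platform/device_context, ConvolutionDescriptor::set() now takes the flag, and when the switch is off the math type for float32 convolutions is forced to CUDNN_FMA_MATH on CUDA 11+. The switch is exposed to Python as core.set_cudnn_switch / core.get_cudnn_switch. A minimal usage sketch, mirroring the pybind bindings and the unit test added below:

import paddle.fluid.core as core

if core.is_compiled_with_cuda():
    print(core.get_cudnn_switch())  # True by default: TF32 allowed for cuDNN convs
    core.set_cudnn_switch(0)        # disallow TF32; fp32 convs use FMA math (CUDA 11+)
    print(core.get_cudnn_switch())  # False
    core.set_cudnn_switch(1)        # restore the default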
paddle/fluid/operators/conv_cudnn_helper.h
@@ -210,16 +210,20 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx =
         ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
         args.cdesc.desc(), CUDNN_DEFAULT_MATH));
     VLOG(5) << "NOT use cudnn_tensor_op_math";
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(
               args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
-    } else {
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetConvolutionMathType(
-              args.cdesc.desc(), CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
+    } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) {
+#if CUDA_VERSION >= 11000
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(
+              args.cdesc.desc(), CUDNN_FMA_MATH));
+#endif  // CUDA_VERSION >= 11000
     }
 #endif
@@ -340,16 +344,20 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
     algo_t algo;
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx =
         ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
         args.cdesc.desc(), CUDNN_DEFAULT_MATH));
     VLOG(5) << "NOT use cudnn_tensor_op_math";
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(
               args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
-    } else {
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetConvolutionMathType(
-              args.cdesc.desc(), CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
+    } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) {
+#if CUDA_VERSION >= 11000
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(
+              args.cdesc.desc(), CUDNN_FMA_MATH));
+#endif  // CUDA_VERSION >= 11000
     }
 #endif
@@ -485,16 +493,20 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx =
         ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
         args.cdesc.desc(), CUDNN_DEFAULT_MATH));
     VLOG(5) << "NOT use cudnn_tensor_op_math";
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(
               args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
-    } else {
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetConvolutionMathType(
-              args.cdesc.desc(), CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
+    } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) {
+#if CUDA_VERSION >= 11000
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(
+              args.cdesc.desc(), CUDNN_FMA_MATH));
+#endif  // CUDA_VERSION >= 11000
     }
 #endif
paddle/fluid/operators/conv_cudnn_op.cu
@@ -240,7 +240,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     auto layout_format = GetCudnnTensorFormat(layout);

     args.handle = handle;
-    args.cdesc.set(dtype, padding_common, strides, dilations);
+    args.cdesc.set(dtype, padding_common, strides, dilations,
+                   platform::AllowTF32Cudnn());

 #if CUDNN_VERSION_MIN(7, 0, 1)
     // cudnn 7 can support groups, no need to do it manually
@@ -603,7 +604,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       args1.idesc.set(transformed_input_grad, layout_tensor);
       args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
       args1.odesc.set(transformed_output_grad_channel, layout_tensor);
-      args1.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
+      args1.cdesc.set(dtype, padding_common, strides, dilations,
+                      platform::AllowTF32Cudnn(), c_groups);

       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
       data_algo =
@@ -620,7 +622,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups);
       args2.odesc.set(transformed_output_grad_channel, layout_tensor);
-      args2.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
+      args2.cdesc.set(dtype, padding_common, strides, dilations,
+                      platform::AllowTF32Cudnn(), c_groups);

       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
       filter_algo =
@@ -980,7 +983,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       args1.idesc.set(transformed_ddX, iwo_group);
       args1.wdesc.set(*W, layout, iwo_group);
       args1.odesc.set(transformed_ddO_channel, iwo_group);
-      args1.cdesc.set(dtype, padding_common, strides, dilations, c_group);
+      args1.cdesc.set(dtype, padding_common, strides, dilations,
+                      platform::AllowTF32Cudnn(), c_group);

       using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
       fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
@@ -995,7 +999,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       args2.wdesc.set(*ddW, layout, iwo_group);
       args2.odesc.set(transformed_ddO_channel, iwo_group);
-      args2.cdesc.set(dtype, padding_common, strides, dilations, c_group);
+      args2.cdesc.set(dtype, padding_common, strides, dilations,
+                      platform::AllowTF32Cudnn(), c_group);

       using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
       fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
@@ -1012,7 +1017,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       args3.odesc.set(transformed_dO_channel, iwo_group);
-      args3.cdesc.set(dtype, padding_common, strides, dilations, c_group);
+      args3.cdesc.set(dtype, padding_common, strides, dilations,
+                      platform::AllowTF32Cudnn(), c_group);

       using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
       filter_algo =
@@ -1028,7 +1034,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       args4.idesc.set(transformed_dX, iwo_group);
       args4.wdesc.set(*ddW, layout, iwo_group);
       args4.odesc.set(transformed_dO_channel, iwo_group);
-      args4.cdesc.set(dtype, padding_common, strides, dilations, c_group);
+      args4.cdesc.set(dtype, padding_common, strides, dilations,
+                      platform::AllowTF32Cudnn(), c_group);

       using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
       data_algo =
paddle/fluid/operators/conv_transpose_cudnn_op.cu
@@ -232,7 +232,8 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     args.idesc.set(transformed_output, iwo_groups);
     args.wdesc.set(*filter, layout_tensor, iwo_groups);
     args.odesc.set(transformed_input, iwo_groups);
-    args.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
+    args.cdesc.set(dtype, padding_common, strides, dilations,
+                   platform::AllowTF32Cudnn(), c_groups);

     using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
     algo = search::Find<T>(args, false, deterministic, ctx);
@@ -468,7 +469,8 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     args1.idesc.set(transformed_output_grad, iwo_groups);
     args1.wdesc.set(*filter, layout_tensor, iwo_groups);
     args1.odesc.set(input_transpose, iwo_groups);
-    args1.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
+    args1.cdesc.set(dtype, padding_common, strides, dilations,
+                    platform::AllowTF32Cudnn(), c_groups);

     using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
     data_algo = search1::Find<T>(args1, false, deterministic, ctx);
     workspace_size =
@@ -481,7 +483,8 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     args2.idesc.set(transformed_output_grad, iwo_groups);
     args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups);
     args2.odesc.set(input_transpose, iwo_groups);
-    args2.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
+    args2.cdesc.set(dtype, padding_common, strides, dilations,
+                    platform::AllowTF32Cudnn(), c_groups);

     using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
     filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
     workspace_size = std::max(workspace_size,
paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -200,6 +200,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
         cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+#if CUDNN_VERSION >= 11000
+    if (!platform::allow_tf32_cudnn) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(cudnn_conv_desc,
+                                                         CUDNN_FMA_MATH));
+    }
+#endif  // CUDA_VERSION >= 11000

     auto x_dims = framework::vectorize(transformed_input.dims());
     auto f_dims = framework::vectorize(filter->dims());
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
@@ -153,6 +153,13 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
           conv_desc[i], CUDNN_DEFAULT_MATH));
+#if CUDNN_VERSION >= 11000
+      if (!platform::allow_tf32_cudnn) {
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnSetConvolutionMathType(conv_desc[i],
+                                                           CUDNN_FMA_MATH));
+      }
+#endif  // CUDA_VERSION >= 11000
     }

     in_dims[2][1] *= 2;
     in_strides[2][0] = oc * h * w;
paddle/fluid/platform/cudnn_desc.h
@@ -24,6 +24,7 @@
 #include <vector>

 #include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/device_context.h"

 namespace paddle {
 namespace framework {
@@ -229,7 +230,8 @@ class ConvolutionDescriptor {
   void set(cudnnDataType_t dtype, const std::vector<int>& pads,
            const std::vector<int>& strides, const std::vector<int>& dilations,
-           const int groups = 1) {
+           bool allow_tf32, const int groups = 1) {
+    allow_tf32_ = allow_tf32;
     cudnnDataType_t compute_type =
         (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
     T* desc = desc_.get();
@@ -246,11 +248,18 @@ class ConvolutionDescriptor {
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
           desc, CUDNN_TENSOR_OP_MATH));
-    }
+    } else if (dtype == CUDNN_DATA_FLOAT && !allow_tf32) {
+#if CUDA_VERSION >= 11000
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(desc, CUDNN_FMA_MATH));
+#endif  // CUDA_VERSION >= 11000
+    }
 #endif
 #endif
   }

+  bool allow_tf32_;
+
  private:
   std::unique_ptr<T, Deleter> desc_;
 };
paddle/fluid/platform/device_context.cc
@@ -74,6 +74,10 @@ namespace platform {
 bool allow_tf32_cublas = true;
 void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
 bool AllowTF32Cublas() { return allow_tf32_cublas; }
+
+bool allow_tf32_cudnn = true;
+void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
+bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
 #endif  // PADDLE_WITH_CUDA

 DeviceContextPool* DeviceContextPool::pool = nullptr;
paddle/fluid/platform/device_context.h
@@ -67,6 +67,10 @@ namespace platform {
 void SetAllowTF32Cublas(bool active);
 /*Get the global variable allow_tf32_cublas value*/
 bool AllowTF32Cublas();
+/*Set the value of the global variable allow_tf32_cudnn*/
+void SetAllowTF32Cudnn(bool active);
+/*Get the global variable allow_tf32_cudnn value*/
+bool AllowTF32Cudnn();
 #endif  // PADDLE_WITH_CUDA

 enum DeviceType {
paddle/fluid/pybind/pybind.cc
@@ -1988,6 +1988,8 @@ All parameter, weight, gradient are variables in Paddle.
 #ifdef PADDLE_WITH_CUDA
   m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
   m.def("get_cublas_switch", platform::AllowTF32Cublas);
+  m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);
+  m.def("get_cudnn_switch", platform::AllowTF32Cudnn);
 #endif  // PADDLE_WITH_CUDA

   using VarQuantScale =
python/paddle/fluid/tests/unittests/test_tf32_cudnn.py
new file (mode 100644)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import six
import numpy as np

import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core


class TestTF32Switch(unittest.TestCase):
    def test_on_off(self):
        if core.is_compiled_with_cuda():
            self.assertTrue(core.get_cudnn_switch())  # default
            core.set_cudnn_switch(0)
            self.assertFalse(core.get_cudnn_switch())  # turn off
            core.set_cudnn_switch(1)
            self.assertTrue(core.get_cudnn_switch())  # turn on

            core.set_cudnn_switch(1)  # restore the switch
        else:
            pass


if __name__ == '__main__':
    unittest.main()
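Beyond the on/off test above, one could also check the numerical effect of the switch. The following is a hypothetical sketch, not part of this commit, assuming an Ampere-class GPU with CUDA 11+ (where cuDNN may pick TF32 kernels for float32 convolutions) and the Paddle 2.0 dygraph API:

# Hypothetical example (not part of this commit): compare a float32 conv2d
# result with the cuDNN TF32 switch enabled vs. disabled.
import numpy as np
import paddle
import paddle.nn.functional as F
import paddle.fluid.core as core

if core.is_compiled_with_cuda():
    x = paddle.randn([1, 8, 32, 32], dtype='float32')
    w = paddle.randn([16, 8, 3, 3], dtype='float32')

    core.set_cudnn_switch(1)   # TF32 allowed (default)
    y_tf32 = F.conv2d(x, w)

    core.set_cudnn_switch(0)   # force FP32 (FMA) math for fp32 convolutions
    y_fp32 = F.conv2d(x, w)

    core.set_cudnn_switch(1)   # restore the default
    # On Ampere GPUs the two results may differ slightly; elsewhere they match.
    print(np.abs((y_tf32 - y_fp32).numpy()).max())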