Commit ac75a9a6 (unverified)
Author: Yuanle Liu, Jan 04, 2023
Committer: GitHub, Jan 04, 2023
Parent: 04aa80e6

[Paddle Inference] fix mixed precision diff (#49475)
Showing 4 changed files with 115 additions and 78 deletions (+115 -78).
paddle/fluid/framework/ir/auto_mixed_precision_pass.cc             +107 -60
paddle/fluid/inference/tests/api/gpu_ernie_half_test.cc              +4  -8
paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu    +3  -9
paddle/phi/kernels/funcs/fc_functor.cu                               +1  -1
paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
@@ -78,11 +78,11 @@ inline bool VarNodeHasDtype(Node* var_node) {
          (type == VarType::VOCAB);
 }
 
-inline bool IsFloatType(VarType::Type type) {
+inline bool IsFP32AndFP64(VarType::Type type) {
   return (type == VarType::FP64) || (type == VarType::FP32);
 }
 
-inline bool IsHalfType(VarType::Type type) {
+inline bool IsFP16AndBFP16(VarType::Type type) {
   return (type == VarType::FP16) || (type == VarType::BF16);
 }
@@ -159,23 +159,14 @@ bool OpSupportPrecision(const std::string& op_type,
 // The set of ops that support fp16 calculation and are considered
 // numerically-dangerous, slower and whose effects may also be observed in
 // downstream ops.
 // ref to python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
 void AutoMixedPrecisionPass::SetDefaultBlacklist() const {
   black_list_.insert({
       // numerically-dangerous
       "acos",
       "asin",
       "cosh",
       "tan",
       "exp",
       "expm1",
       "square",
       "log",
       "log2",
       "log10",
       "log1p",
       "logsumexp",
       "mean",
       "rsqrt",
       "sum",
       "cos_sim",
       "softmax_with_cross_entropy",
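The blacklist above exists because these ops overflow or lose accuracy in half precision even for ordinary inputs; fp16's largest finite value is 65504, so an op like exp blows past it quickly. A standalone illustration (plain C++, not Paddle code):

    #include <cmath>
    #include <cstdio>

    // Why "exp" is blacklisted for fp16: the result exceeds fp16's largest
    // finite value (65504) already for small arguments, becoming inf on cast.
    int main() {
      std::printf("exp(12) = %g\n", std::exp(12.0));  // ~162754.8 > 65504
      return 0;
    }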
@@ -271,6 +262,9 @@ void AutoMixedPrecisionPass::ApplyImpl(Graph* graph) const {
   VLOG(4) << "InsertCastOp done";
   RestoreOpOriginType();
   VLOG(4) << "RestoreOpOriginType done";
+  LOG(INFO) << "The number of ops run at low precision ["
+            << op_run_low_precision_.size() << "/" << op_original_type_.size()
+            << "]";
 }
 
 void AutoMixedPrecisionPass::SetOpUniqueType() const {
@@ -314,11 +308,26 @@ void AutoMixedPrecisionPass::ProcessOpWithDtypeAttr() const {
   for (const auto& nodes : all_op_nodes_) {
     for (auto* op_node : nodes) {
       auto op_type = op_node->Op()->Type();
+      if (op_node->Op()->HasAttr("in_dtype")) {
+        auto* var_node = op_node->inputs[0];
+        auto* real_var_node = real_vars_[var_node->Var()->Name()];
+        if (IsFP16AndBFP16(real_var_node->Var()->GetDataType())) {
+          op_node->Op()->SetAttr(
+              "in_dtype",
+              static_cast<int>(framework::TransToProtoVarType(low_precision_)));
+          op_node->Op()->Flush();
+          VLOG(4) << "process op with in_dtype attr: " << op_type << " ( "
+                  << static_cast<int>(real_var_node->Var()->GetDataType())
+                  << " --->" << static_cast<int>(low_precision_) << " )";
+        }
+      }
+
       if (op_run_low_precision_.count(op_type) == 0) continue;
 
       if (op_node->Op()->HasAttr("dtype")) {
         auto dtype = op_node->Op()->GetAttrIfExists<int>("dtype");
-        if (IsFloatType(static_cast<VarType::Type>(dtype))) {
+        if (IsFP32AndFP64(static_cast<VarType::Type>(dtype))) {
           op_node->Op()->SetAttr(
               "dtype",
               static_cast<int>(framework::TransToProtoVarType(low_precision_)));
@@ -326,10 +335,9 @@ void AutoMixedPrecisionPass::ProcessOpWithDtypeAttr() const {
           VLOG(4) << "process op with dtype attr: " << op_type << " ( " << dtype
                   << " --->" << static_cast<int>(low_precision_) << " )";
         }
-      }
-
-      if (op_node->Op()->HasAttr("out_dtype")) {
+      } else if (op_node->Op()->HasAttr("out_dtype")) {
         auto out_dtype = op_node->Op()->GetAttrIfExists<int>("out_dtype");
-        if (IsFloatType(static_cast<VarType::Type>(out_dtype))) {
+        if (IsFP32AndFP64(static_cast<VarType::Type>(out_dtype))) {
           op_node->Op()->SetAttr(
               "out_dtype",
               static_cast<int>(framework::TransToProtoVarType(low_precision_)));
@@ -358,14 +366,34 @@ void AutoMixedPrecisionPass::GetOpPrecision() const {
       if (op_node->Op()->HasAttr("dtype")) {
         auto dtype = op_node->Op()->GetAttrIfExists<int>("dtype");
-        support_low_precision = support_low_precision &&
-                                IsFloatType(static_cast<VarType::Type>(dtype));
+        support_low_precision =
+            support_low_precision &&
+            IsFP32AndFP64(static_cast<VarType::Type>(dtype));
       } else if (op_node->Op()->HasAttr("out_dtype")) {
         auto out_dtype = op_node->Op()->GetAttrIfExists<int>("out_dtype");
         support_low_precision =
             support_low_precision &&
-            IsFloatType(static_cast<VarType::Type>(out_dtype));
-      } else {
+            IsFP32AndFP64(static_cast<VarType::Type>(out_dtype));
+      }
+
+      // If scale op's "scale" and "bias" attr value exceed the range of fp16
+      // and bf16, it cannot run at low precision.
+      if (GetOpOriginalType(op_node->Op()->Type()) == "scale") {
+        auto scale = op_node->Op()->GetAttrIfExists<float>("scale");
+        auto bias = op_node->Op()->GetAttrIfExists<float>("bias");
+        if (low_precision_ == phi::DataType::FLOAT16) {
+          support_low_precision =
+              support_low_precision &&
+              phi::dtype::isfinite(static_cast<phi::dtype::float16>(scale)) &&
+              phi::dtype::isfinite(static_cast<phi::dtype::float16>(bias));
+        } else if (low_precision_ == phi::DataType::BFLOAT16) {
+          support_low_precision =
+              support_low_precision &&
+              phi::dtype::isfinite(static_cast<phi::dtype::bfloat16>(scale)) &&
+              phi::dtype::isfinite(static_cast<phi::dtype::bfloat16>(bias));
+        }
+      }
+
       // if op's input var and output var is not dense tensor, the op should
       // not run at low precision.
       for (auto* in_var_node : op_node->inputs) {
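The scale-op guard added above relies on the fact that casting a too-large float to fp16 produces infinity, which phi::dtype::isfinite then rejects. A minimal standalone sketch of the same check; FitsInFp16 and its constant are illustrative, not Paddle APIs:

    #include <cmath>
    #include <cstdio>

    // Sketch of the guard: fp16's largest finite value is 65504, so attribute
    // values beyond that range would overflow to inf and must keep the op fp32.
    bool FitsInFp16(float v) {
      const float kFp16Max = 65504.0f;
      return std::isfinite(v) && std::fabs(v) <= kFp16Max;
    }

    int main() {
      std::printf("scale = 0.5: %d\n", FitsInFp16(0.5f));  // 1: fp16 is safe
      std::printf("scale = 1e6: %d\n", FitsInFp16(1e6f));  // 0: stay at fp32
      return 0;
    }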
@@ -377,7 +405,6 @@ void AutoMixedPrecisionPass::GetOpPrecision() const {
             support_low_precision &&
             (real_in_var_node->Var()->GetType() == VarType::LOD_TENSOR);
       }
-
       for (auto* out_var_node : op_node->outputs) {
         CHECK_EQ(out_var_node->IsVar(), true);
         auto* real_out_var_node = real_vars_[out_var_node->Var()->Name()];
@@ -387,7 +414,6 @@ void AutoMixedPrecisionPass::GetOpPrecision() const {
             support_low_precision &&
             (real_out_var_node->Var()->GetType() == VarType::LOD_TENSOR);
       }
     }
-
     if (support_low_precision) {
       op_run_low_precision_.insert(op_type);
@@ -438,7 +464,7 @@ void AutoMixedPrecisionPass::UpdateOpPrecision() const {
       }
 
       // when op_1 only support cpu kernel. if op_2's intput var is op_1's
-      // output var, then op_2 should not run half.
+      // output var, then op_2 should not run at low precision.
       if (GetOpOriginalType(op_type) != "feed" &&
           !GpuKernelSupportPrecision(GetOpOriginalType(op_type),
                                      phi::DataType::FLOAT32)) {
@@ -596,7 +622,7 @@ void AutoMixedPrecisionPass::SetVarPrecision() const {
         auto* real_in_var_node = real_vars_[in_var_node->Var()->Name()];
         auto in_var_name = real_in_var_node->Var()->Name();
 
-        if (!IsFloatType(real_in_var_node->Var()->GetDataType())) continue;
+        if (!IsFP32AndFP64(real_in_var_node->Var()->GetDataType())) continue;
         if (!VarNodeHasDtype(real_in_var_node)) continue;
         if (InputVarsNotConvert(op_node, in_var_name)) continue;
@@ -615,7 +641,7 @@ void AutoMixedPrecisionPass::SetVarPrecision() const {
         auto* real_out_var_node = real_vars_[out_var_node->Var()->Name()];
         auto out_var_name = real_out_var_node->Var()->Name();
 
-        if (!IsFloatType(real_out_var_node->Var()->GetDataType())) continue;
+        if (!IsFP32AndFP64(real_out_var_node->Var()->GetDataType())) continue;
         if (!VarNodeHasDtype(real_out_var_node)) continue;
        if (OutputVarsNotConvert(op_node, out_var_name)) continue;
@@ -655,7 +681,7 @@ void AutoMixedPrecisionPass::ConvertWeightsData() const {
   auto var_names = scope->LocalVarNames();
   for (const auto& var_name : var_names) {
     if (vars_convert_to_low_precision_.count(var_name)) {
-      VLOG(4) << var_name << "'s data type was convert to half";
+      VLOG(4) << var_name << "'s data type was convert to low precision";
       auto* var = scope->FindLocalVar(var_name);
       CHECK_EQ(var->IsType<phi::DenseTensor>(), true);
@@ -682,16 +708,18 @@ void AutoMixedPrecisionPass::ConvertWeightsData() const {
           }
         }
       } else if (low_precision_ == phi::DataType::BFLOAT16) {
-        auto* half_data =
+        auto* low_precision_data =
             low_precision_tensor.mutable_data<phi::dtype::bfloat16>(
                 phi::CPUPlace{});
         for (int64_t i = 0; i < origin_tensor->numel(); i++) {
           if (origin_tensor->dtype() == phi::DataType::FLOAT64) {
             auto* origin_data = origin_tensor->data<double>();
-            half_data[i] = static_cast<phi::dtype::bfloat16>(origin_data[i]);
+            low_precision_data[i] =
+                static_cast<phi::dtype::bfloat16>(origin_data[i]);
           } else if (origin_tensor->dtype() == phi::DataType::FLOAT32) {
             auto* origin_data = origin_tensor->data<float>();
-            half_data[i] = static_cast<phi::dtype::bfloat16>(origin_data[i]);
+            low_precision_data[i] =
+                static_cast<phi::dtype::bfloat16>(origin_data[i]);
           }
         }
       }
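The loop above narrows fp32/fp64 weights element by element. As a reminder of what bfloat16 narrowing costs, here is a rough standalone emulation; real converters such as phi::dtype::bfloat16 round to nearest, while this sketch truncates for brevity:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Rough bf16 emulation: keep float32's exponent, drop 16 low mantissa bits.
    // Range is preserved; precision drops to about 3 decimal digits.
    float ToBf16AndBack(float v) {
      uint32_t bits;
      std::memcpy(&bits, &v, sizeof(bits));
      bits &= 0xFFFF0000u;  // truncation; real converters round to nearest
      float out;
      std::memcpy(&out, &bits, sizeof(out));
      return out;
    }

    int main() {
      std::printf("%.8f -> %.8f\n", 0.1, ToBf16AndBack(0.1f));  // ~0.09960938
      return 0;
    }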
@@ -731,27 +759,46 @@ void AutoMixedPrecisionPass::InsertCastOp() const {
         VLOG(4) << "process var: " << real_in_var_node->Var()->Name()
                 << " with type " << in_var_type;
 
-        if (IsFloatType(in_var_type) && op_run_low_precision_.count(op_type)) {
+        if (IsFP32AndFP64(in_var_type) &&
+            op_run_low_precision_.count(op_type)) {
+          auto to_type = framework::TransToProtoVarType(low_precision_);
+          auto* prev_op =
+              in_var_node->inputs.empty() ? nullptr : in_var_node->inputs[0];
+          if (prev_op && GetOpOriginalType(prev_op->Op()->Type()) == "cast") {
+            in_var_node->Var()->SetDataType(to_type);
+            prev_op->Op()->SetAttr("out_dtype", static_cast<int>(to_type));
+            prev_op->Op()->Flush();
+          } else {
            DoInsertCastOp(subgraphes_[i],
                           in_var_node,
                           op_node,
                           in_var_type,
-                          framework::TransToProtoVarType(low_precision_),
+                          to_type,
                           block_desc,
                           &suffix,
                           &cache);
-        } else if (IsHalfType(in_var_type) &&
+          }
+        } else if (IsFP16AndBFP16(in_var_type) &&
                    op_run_low_precision_.count(op_type) == 0) {
+          auto to_type = VarType::FP32;
+          auto* prev_op =
+              in_var_node->inputs.empty() ? nullptr : in_var_node->inputs[0];
+          if (prev_op && GetOpOriginalType(prev_op->Op()->Type()) == "cast") {
+            in_var_node->Var()->SetDataType(to_type);
+            prev_op->Op()->SetAttr("out_dtype", static_cast<int>(to_type));
+            prev_op->Op()->Flush();
+          } else {
            DoInsertCastOp(subgraphes_[i],
                           in_var_node,
                           op_node,
                           in_var_type,
-                          VarType::FP32,
+                          to_type,
                           block_desc,
                           &suffix,
                           &cache);
+          }
         }
       }
 
       // Special op.
       // fused_multi_transformer's input(CacheKV) and output(CacheKVOut) vars
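The two rewritten branches above share one new idea: before inserting a cast, inspect the producer of the input var, and if it is already a cast op, simply retarget its out_dtype instead of chaining a second cast. A toy sketch of that pattern with illustrative types only, not Paddle's IR:

    #include <memory>
    #include <string>
    #include <vector>

    // Toy graph node standing in for Paddle's op nodes (illustrative only).
    struct ToyOp {
      std::string type;
      int out_dtype = -1;  // dtype id this op produces
      ToyOp* input_producer = nullptr;
    };

    // Mirror of the new branch logic: reuse a producing "cast" when present,
    // otherwise splice a fresh cast between producer and consumer.
    void EnsureInputDtype(ToyOp* op, int to_type,
                          std::vector<std::unique_ptr<ToyOp>>* owned) {
      ToyOp* prev = op->input_producer;
      if (prev != nullptr && prev->type == "cast") {
        prev->out_dtype = to_type;  // fold conversion into the existing cast
        return;
      }
      auto cast = std::make_unique<ToyOp>();
      cast->type = "cast";
      cast->out_dtype = to_type;
      cast->input_producer = prev;
      op->input_producer = cast.get();  // consumer now reads through the cast
      owned->push_back(std::move(cast));
    }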
paddle/fluid/inference/tests/api/gpu_ernie_half_test.cc

@@ -164,7 +164,7 @@ TEST(Ernie_gpu_fp16_no_ir, compare_results) {
     }
     float* result = reinterpret_cast<float*>(output.data.data());
     for (size_t j = 0; j < outputs_size; ++j) {
-      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 5e-2);
+      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 8e-3);
     }
   }
 }
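This and the three tests below tighten their tolerances (here from 5e-2 to 8e-3), reflecting the smaller fp32 vs. low-precision gap once the pass and kernels are fixed. For reference, gtest's EXPECT_NEAR(a, b, tol) passes exactly when |a - b| <= tol; a minimal illustration with arbitrary values:

    #include <gtest/gtest.h>

    // EXPECT_NEAR(val1, val2, abs_error) asserts |val1 - val2| <= abs_error.
    TEST(ExpectNearSemantics, Illustration) {
      EXPECT_NEAR(1.000f, 1.007f, 8e-3);     // passes: diff 7e-3 <= 8e-3
      // EXPECT_NEAR(1.000f, 1.010f, 8e-3);  // would fail: 1e-2 > 8e-3
    }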
@@ -175,8 +175,6 @@ TEST(Ernie_gpu_fp16_with_ir, compare_results) {
   config.SetModel(FLAGS_infer_model);
   config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kHalf);
   config.SwitchIrOptim(true);
-  // The fc_fuse_pass has diff, which will be repaired later.
-  config.pass_builder()->DeletePass("fc_fuse_pass");
   // There is a problem with the model itself, which has nothing to do with
   // constant_folding_pass.
   config.pass_builder()->DeletePass("constant_folding_pass");
@@ -206,7 +204,7 @@ TEST(Ernie_gpu_fp16_with_ir, compare_results) {
     }
     float* result = reinterpret_cast<float*>(output.data.data());
     for (size_t j = 0; j < outputs_size; ++j) {
-      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 5e-2);
+      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 2e-2);
     }
   }
 }
@@ -243,7 +241,7 @@ TEST(Ernie_gpu_bf16_no_ir, compare_results) {
     }
     float* result = reinterpret_cast<float*>(output.data.data());
     for (size_t j = 0; j < outputs_size; ++j) {
-      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 7e-2);
+      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 1e-2);
     }
   }
 }
@@ -254,8 +252,6 @@ TEST(Ernie_gpu_bf16_with_ir, compare_results) {
   config.SetModel(FLAGS_infer_model);
   config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kBf16);
   config.SwitchIrOptim(true);
-  // The fc_fuse_pass has diff, which will be repaired later.
-  config.pass_builder()->DeletePass("fc_fuse_pass");
   // There is a problem with the model itself, which has nothing to do with
   // constant_folding_pass.
   config.pass_builder()->DeletePass("constant_folding_pass");
@@ -285,7 +281,7 @@ TEST(Ernie_gpu_bf16_with_ir, compare_results) {
     }
     float* result = reinterpret_cast<float*>(output.data.data());
     for (size_t j = 0; j < outputs_size; ++j) {
-      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 7e-2);
+      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 5e-3);
     }
   }
 }
paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu

@@ -223,13 +223,7 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
       // For layer_norm, reduce to calculate mean and std
       sum_i += static_cast<float>(tmp_3);
-#if defined(PADDLE_WITH_CUDA) && __CUDA_ARCH__ >= 530
-      square_sum_i += static_cast<float>(__hmul(tmp_3, tmp_3));
-#elif defined(PADDLE_WITH_CUDA)
       square_sum_i += static_cast<float>(tmp_3) * static_cast<float>(tmp_3);
-#else
-      square_sum_i += static_cast<float>(tmp_3 * tmp_3);
-#endif
     }
     auto pair = BlockReduce(temp_storage)
                     .Reduce(PairForLayerNorm<float>(sum_i, square_sum_i),
@@ -282,7 +276,7 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
     half tmp_0 = __hdiv(__hsub(save_ptr[save_index], mean_i), std_i);
     half tmp_1 = scale ? __hmul(scale[j], tmp_0) : tmp_0;
 #else
-    half tmp_0 = static_cast<float>(static_cast<float>(save_ptr[save_index]) +
+    half tmp_0 = static_cast<half>(static_cast<float>(save_ptr[save_index]) -
                                    static_cast<float>(mean_i) /
                                    static_cast<float>(std_i));
     half tmp_1 = scale ? static_cast<half>(static_cast<float>(scale[j]) *
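The intrinsic branch above, kept as context, spells out the intended math: __hdiv(__hsub(x, mean), std), i.e. subtract the mean, then divide by the standard deviation, then optionally scale. A plain-float reference of that formula as a standalone sketch, not the kernel code:

    // Reference for the per-element layer_norm step both branches must match.
    float NormalizeOne(float x, float mean, float std_dev,
                       const float* scale, int j) {
      float tmp_0 = (x - mean) / std_dev;  // matches __hdiv(__hsub(x, mean), std)
      return scale != nullptr ? scale[j] * tmp_0 : tmp_0;  // optional gamma
    }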
paddle/phi/kernels/funcs/fc_functor.cu
@@ -164,7 +164,7 @@ __global__ void bias_relu_v4_half2(const int num,
       data_vec[unroll_idx] = __hmax2(__half2(0, 0), data_vec[unroll_idx]);
 #elif __CUDA_ARCH__ >= 530
       data_vec[unroll_idx] = __hmul2(
-          __hgt2(__half2(0, 0), data_vec[unroll_idx]), data_vec[unroll_idx]);
+          __hgt2(data_vec[unroll_idx], __half2(0, 0)), data_vec[unroll_idx]);
 #else
       data_vec[unroll_idx].x =
           static_cast<int>(static_cast<float>(data_vec[unroll_idx].x) > 0) *
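The one-line fix swaps __hgt2's operands. The intrinsic compares lane-wise and returns 1.0 where the first operand is greater and 0.0 otherwise, so a ReLU mask must be x > 0; the old order built a mask for 0 > x and kept only the negative lanes. A scalar model of the corrected logic in plain C++, not the CUDA intrinsics:

    // Scalar model of the fixed half2 ReLU path.
    float ReluViaCompareMask(float x) {
      float mask = (x > 0.0f) ? 1.0f : 0.0f;  // models __hgt2(x, __half2(0, 0))
      return mask * x;                        // models __hmul2(mask, x)
    }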