Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
5b4f8aac
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
5b4f8aac
编写于
3月 04, 2021
作者:
J
jakpiase
提交者:
GitHub
3月 04, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Added LSTM BF16 and fixed GRU BF16 (#31234)
上级
7cdf6ea7
变更
9
显示空白变更内容
内联
并排
Showing
9 changed file
with
322 addition
and
58 deletion
+322
-58
paddle/fluid/operators/fused/fusion_lstm_op.cc
paddle/fluid/operators/fused/fusion_lstm_op.cc
+4
-0
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+44
-20
paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
+53
-21
paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h
paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h
+8
-10
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
.../tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
+31
-7
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
.../tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
+2
-0
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
...tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
+159
-0
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+19
-0
tools/static_mode_white_list.py
tools/static_mode_white_list.py
+2
-0
未找到文件。
paddle/fluid/operators/fused/fusion_lstm_op.cc
浏览文件 @
5b4f8aac
...
@@ -249,6 +249,10 @@ void FusionLSTMOpMaker::Make() {
...
@@ -249,6 +249,10 @@ void FusionLSTMOpMaker::Make() {
AddAttr
<
bool
>
(
"use_mkldnn"
,
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false) Only used in mkldnn kernel"
)
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"force_fp32_output"
,
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8"
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
Fusion Long-Short Term Memory (LSTM) Operator.
Fusion Long-Short Term Memory (LSTM) Operator.
This operator fuse the X into LSTM, more details can refer to LSTM op.
This operator fuse the X into LSTM, more details can refer to LSTM op.
...
...
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
浏览文件 @
5b4f8aac
...
@@ -89,6 +89,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
...
@@ -89,6 +89,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
}
}
}
}
template
<
typename
U
>
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightXMemory
(
const
Tensor
*
weight_x
,
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightXMemory
(
const
Tensor
*
weight_x
,
const
bool
origin_mode
)
{
const
bool
origin_mode
)
{
const
std
::
string
wx_key
=
this
->
memory_key_
+
"@weight_x"
;
const
std
::
string
wx_key
=
this
->
memory_key_
+
"@weight_x"
;
...
@@ -98,18 +99,18 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
...
@@ -98,18 +99,18 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
if
(
!
memory_p
)
{
if
(
!
memory_p
)
{
auto
user_md
=
auto
user_md
=
MKLDNNMemDesc
({
1
,
1
,
this
->
IC
,
this
->
G
,
this
->
OC
},
MKLDNNMemDesc
({
1
,
1
,
this
->
IC
,
this
->
G
,
this
->
OC
},
MKLDNNGetDataType
<
float
>
(),
MKLDNNMemoryFormat
::
ldigo
);
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldigo
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
*
weight_x_data
=
auto
*
weight_x_data
=
reinterpret_cast
<
U
*>
(
user_memory
.
get_data_handle
());
reinterpret_cast
<
float
*>
(
user_memory
.
get_data_handle
());
memcpy
(
weight_x_data
,
weight_x
->
data
<
U
>
(),
memcpy
(
weight_x_data
,
weight_x
->
data
<
float
>
(),
sizeof
(
U
)
*
this
->
IC
*
this
->
G
*
this
->
OC
);
sizeof
(
float
)
*
this
->
IC
*
this
->
G
*
this
->
OC
);
if
(
origin_mode
==
false
)
{
if
(
origin_mode
==
false
)
{
for
(
int64_t
i
=
0
;
i
<
this
->
IC
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
this
->
IC
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
this
->
OC
;
++
j
)
{
for
(
int64_t
j
=
0
;
j
<
this
->
OC
;
++
j
)
{
weight_x_data
[
j
]
*=
-
1
;
U
minus_one
(
-
1.0
f
);
weight_x_data
[
j
]
=
minus_one
*
weight_x_data
[
j
];
}
}
weight_x_data
+=
3
*
this
->
OC
;
weight_x_data
+=
3
*
this
->
OC
;
}
}
...
@@ -127,6 +128,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
...
@@ -127,6 +128,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
return
memory_p
;
return
memory_p
;
}
}
template
<
typename
U
>
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightHMemory
(
const
Tensor
*
weight_h
,
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightHMemory
(
const
Tensor
*
weight_h
,
const
bool
origin_mode
)
{
const
bool
origin_mode
)
{
const
std
::
string
wh_key
=
this
->
memory_key_
+
"@weight_h"
;
const
std
::
string
wh_key
=
this
->
memory_key_
+
"@weight_h"
;
...
@@ -136,34 +138,33 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
...
@@ -136,34 +138,33 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
if
(
!
memory_p
)
{
if
(
!
memory_p
)
{
auto
user_md
=
auto
user_md
=
MKLDNNMemDesc
({
1
,
1
,
this
->
OC
,
this
->
G
,
this
->
OC
},
MKLDNNMemDesc
({
1
,
1
,
this
->
OC
,
this
->
G
,
this
->
OC
},
MKLDNNGetDataType
<
float
>
(),
MKLDNNMemoryFormat
::
ldigo
);
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldigo
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
// Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
// Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
// oneDNN format [OC, 3OC]
// oneDNN format [OC, 3OC]
auto
*
weight_h_data
=
auto
*
weight_h_data
=
reinterpret_cast
<
U
*>
(
user_memory
.
get_data_handle
());
reinterpret_cast
<
float
*>
(
user_memory
.
get_data_handle
());
auto
*
user_weight_h_data
=
weight_h
->
data
<
U
>
();
auto
*
user_weight_h_data
=
weight_h
->
data
<
float
>
();
auto
src1_iter
=
user_weight_h_data
;
auto
src1_iter
=
user_weight_h_data
;
auto
src2_iter
=
user_weight_h_data
+
2
*
this
->
OC
*
this
->
OC
;
auto
src2_iter
=
user_weight_h_data
+
2
*
this
->
OC
*
this
->
OC
;
for
(
int64_t
c
=
0
;
c
<
this
->
OC
;
++
c
)
{
for
(
int64_t
c
=
0
;
c
<
this
->
OC
;
++
c
)
{
memcpy
(
weight_h_data
,
src1_iter
,
2
*
this
->
OC
*
sizeof
(
float
));
memcpy
(
weight_h_data
,
src1_iter
,
2
*
this
->
OC
*
sizeof
(
U
));
memcpy
(
weight_h_data
+
2
*
this
->
OC
,
src2_iter
,
memcpy
(
weight_h_data
+
2
*
this
->
OC
,
src2_iter
,
this
->
OC
*
sizeof
(
U
));
this
->
OC
*
sizeof
(
float
));
src1_iter
+=
2
*
this
->
OC
;
src1_iter
+=
2
*
this
->
OC
;
src2_iter
+=
this
->
OC
;
src2_iter
+=
this
->
OC
;
weight_h_data
+=
3
*
this
->
OC
;
weight_h_data
+=
3
*
this
->
OC
;
}
}
weight_h_data
=
reinterpret_cast
<
float
*>
(
user_memory
.
get_data_handle
());
weight_h_data
=
reinterpret_cast
<
U
*>
(
user_memory
.
get_data_handle
());
if
(
origin_mode
==
false
)
{
if
(
origin_mode
==
false
)
{
for
(
int64_t
i
=
0
;
i
<
this
->
OC
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
this
->
OC
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
this
->
OC
;
++
j
)
{
for
(
int64_t
j
=
0
;
j
<
this
->
OC
;
++
j
)
{
weight_h_data
[
j
]
*=
-
1
;
U
minus_one
(
-
1.0
f
);
weight_h_data
[
j
]
=
minus_one
*
weight_h_data
[
j
];
}
}
weight_h_data
+=
3
*
this
->
OC
;
weight_h_data
+=
3
*
this
->
OC
;
}
}
...
@@ -273,11 +274,34 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
...
@@ -273,11 +274,34 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
auto
input_memory_p
=
auto
input_memory_p
=
handler
.
AcquireInputMemoryWithReorder
(
input
,
is_reverse
);
handler
.
AcquireInputMemoryWithReorder
(
input
,
is_reverse
);
auto
h0_memory_p
=
handler
.
AcquireH0Memory
(
h0
);
auto
weight_x_memory_p
=
std
::
shared_ptr
<
dnnl
::
memory
>
h0_memory_p
,
weight_h_memory_p
,
handler
.
AcquireWeightXMemory
(
weight_x
,
origin_mode
);
weight_x_memory_p
;
auto
weight_h_memory_p
=
handler
.
AcquireWeightHMemory
(
weight_h
,
origin_mode
);
if
(
weight_h
->
type
()
==
paddle
::
framework
::
proto
::
VarType_Type_FP32
)
{
h0_memory_p
=
handler
.
template
AcquireH0Memory
<
float
>(
h0
);
weight_x_memory_p
=
handler
.
template
AcquireWeightXMemory
<
float
>(
weight_x
,
origin_mode
);
weight_h_memory_p
=
handler
.
template
AcquireWeightHMemory
<
float
>(
weight_h
,
origin_mode
);
}
else
if
(
weight_h
->
type
()
==
paddle
::
framework
::
proto
::
VarType_Type_BF16
)
{
h0_memory_p
=
handler
.
template
AcquireH0Memory
<
paddle
::
platform
::
bfloat16
>(
h0
);
weight_x_memory_p
=
handler
.
template
AcquireWeightXMemory
<
paddle
::
platform
::
bfloat16
>(
weight_x
,
origin_mode
);
weight_h_memory_p
=
handler
.
template
AcquireWeightHMemory
<
paddle
::
platform
::
bfloat16
>(
weight_h
,
origin_mode
);
}
else
{
h0_memory_p
=
handler
.
template
AcquireH0Memory
<
uint8_t
>(
h0
);
weight_x_memory_p
=
handler
.
template
AcquireWeightXMemory
<
int8_t
>(
weight_x
,
origin_mode
);
weight_h_memory_p
=
handler
.
template
AcquireWeightHMemory
<
int8_t
>(
weight_h
,
origin_mode
);
}
auto
bias_memory_p
=
handler
.
AcquireBiasMemory
(
bias
,
origin_mode
);
auto
bias_memory_p
=
handler
.
AcquireBiasMemory
(
bias
,
origin_mode
);
auto
hidden_onednn_memory_p
=
handler
.
AcquireOutputMemory
();
auto
hidden_onednn_memory_p
=
handler
.
AcquireOutputMemory
();
...
...
paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
浏览文件 @
5b4f8aac
...
@@ -81,7 +81,10 @@ class LSTMMKLDNNHandler
...
@@ -81,7 +81,10 @@ class LSTMMKLDNNHandler
MKLDNNMemoryFormat
::
tnc
);
MKLDNNMemoryFormat
::
tnc
);
auto
h0_md
=
MKLDNNMemDesc
({
L
,
D
,
N
,
OC
},
MKLDNNGetDataType
<
T
>
(),
auto
h0_md
=
MKLDNNMemDesc
({
L
,
D
,
N
,
OC
},
MKLDNNGetDataType
<
T
>
(),
MKLDNNMemoryFormat
::
ldnc
);
MKLDNNMemoryFormat
::
ldnc
);
auto
c0_md
=
MKLDNNMemDesc
({
L
,
D
,
N
,
OC
},
MKLDNNGetDataType
<
T
>
(),
auto
c0_md
=
MKLDNNMemDesc
(
{
L
,
D
,
N
,
OC
},
MKLDNNGetDataType
<
float
>
(),
// Vanilla LSTM and LSTM
// with peepoles has c0 as
// fp32
MKLDNNMemoryFormat
::
ldnc
);
MKLDNNMemoryFormat
::
ldnc
);
// Create LSTM oneDNN primitive
// Create LSTM oneDNN primitive
...
@@ -110,13 +113,14 @@ class LSTMMKLDNNHandler
...
@@ -110,13 +113,14 @@ class LSTMMKLDNNHandler
// needed
// needed
// PaddlePaddle: {c, i, f, o}
// PaddlePaddle: {c, i, f, o}
// oneDNN: {i, f, c, o}
// oneDNN: {i, f, c, o}
void
ReorderGates
(
float
*
weights
,
int64_t
I
)
{
template
<
typename
U
>
void
ReorderGates
(
U
*
weights
,
int64_t
I
)
{
size_t
inner_block_size
=
this
->
OC
;
size_t
inner_block_size
=
this
->
OC
;
size_t
block_size
=
inner_block_size
*
this
->
G
;
size_t
block_size
=
inner_block_size
*
this
->
G
;
for
(
size_t
i
=
0
;
i
<
(
size_t
)
I
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
(
size_t
)
I
;
++
i
)
{
size_t
offset
=
i
*
block_size
;
size_t
offset
=
i
*
block_size
;
float
*
base_pos
=
weights
+
offset
;
U
*
base_pos
=
weights
+
offset
;
std
::
swap_ranges
(
base_pos
,
base_pos
+
inner_block_size
,
std
::
swap_ranges
(
base_pos
,
base_pos
+
inner_block_size
,
base_pos
+
inner_block_size
);
// c <-> i
base_pos
+
inner_block_size
);
// c <-> i
std
::
swap_ranges
(
base_pos
+
inner_block_size
,
std
::
swap_ranges
(
base_pos
+
inner_block_size
,
...
@@ -125,6 +129,7 @@ class LSTMMKLDNNHandler
...
@@ -125,6 +129,7 @@ class LSTMMKLDNNHandler
}
}
}
}
template
<
typename
U
>
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightXMemory
(
const
Tensor
*
weight_x
)
{
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightXMemory
(
const
Tensor
*
weight_x
)
{
const
std
::
string
wx_key
=
this
->
memory_key_
+
"@weight_x"
;
const
std
::
string
wx_key
=
this
->
memory_key_
+
"@weight_x"
;
auto
memory_p
=
auto
memory_p
=
...
@@ -133,13 +138,12 @@ class LSTMMKLDNNHandler
...
@@ -133,13 +138,12 @@ class LSTMMKLDNNHandler
if
(
!
memory_p
)
{
if
(
!
memory_p
)
{
auto
user_md
=
auto
user_md
=
MKLDNNMemDesc
({
1
,
1
,
this
->
IC
,
this
->
G
,
this
->
OC
},
MKLDNNMemDesc
({
1
,
1
,
this
->
IC
,
this
->
G
,
this
->
OC
},
MKLDNNGetDataType
<
float
>
(),
MKLDNNMemoryFormat
::
ldigo
);
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldigo
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
*
weight_x_data
=
auto
*
weight_x_data
=
reinterpret_cast
<
U
*>
(
user_memory
.
get_data_handle
());
reinterpret_cast
<
float
*>
(
user_memory
.
get_data_handle
());
memcpy
(
weight_x_data
,
weight_x
->
data
<
U
>
(),
memcpy
(
weight_x_data
,
weight_x
->
data
<
float
>
(),
sizeof
(
U
)
*
this
->
IC
*
this
->
G
*
this
->
OC
);
sizeof
(
float
)
*
this
->
IC
*
this
->
G
*
this
->
OC
);
ReorderGates
(
weight_x_data
,
this
->
IC
);
ReorderGates
(
weight_x_data
,
this
->
IC
);
...
@@ -155,6 +159,7 @@ class LSTMMKLDNNHandler
...
@@ -155,6 +159,7 @@ class LSTMMKLDNNHandler
return
memory_p
;
return
memory_p
;
}
}
template
<
typename
U
>
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightHMemory
(
const
Tensor
*
weight_h
)
{
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightHMemory
(
const
Tensor
*
weight_h
)
{
const
std
::
string
wh_key
=
this
->
memory_key_
+
"@weight_h"
;
const
std
::
string
wh_key
=
this
->
memory_key_
+
"@weight_h"
;
auto
memory_p
=
auto
memory_p
=
...
@@ -163,13 +168,12 @@ class LSTMMKLDNNHandler
...
@@ -163,13 +168,12 @@ class LSTMMKLDNNHandler
if
(
!
memory_p
)
{
if
(
!
memory_p
)
{
auto
user_md
=
auto
user_md
=
MKLDNNMemDesc
({
1
,
1
,
this
->
OC
,
this
->
G
,
this
->
OC
},
MKLDNNMemDesc
({
1
,
1
,
this
->
OC
,
this
->
G
,
this
->
OC
},
MKLDNNGetDataType
<
float
>
(),
MKLDNNMemoryFormat
::
ldigo
);
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldigo
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
*
weight_h_data
=
auto
*
weight_h_data
=
reinterpret_cast
<
U
*>
(
user_memory
.
get_data_handle
());
reinterpret_cast
<
float
*>
(
user_memory
.
get_data_handle
());
memcpy
(
weight_h_data
,
weight_h
->
data
<
U
>
(),
memcpy
(
weight_h_data
,
weight_h
->
data
<
float
>
(),
sizeof
(
U
)
*
this
->
OC
*
this
->
G
*
this
->
OC
);
sizeof
(
float
)
*
this
->
OC
*
this
->
G
*
this
->
OC
);
ReorderGates
(
weight_h_data
,
this
->
OC
);
ReorderGates
(
weight_h_data
,
this
->
OC
);
...
@@ -258,8 +262,8 @@ class LSTMMKLDNNHandler
...
@@ -258,8 +262,8 @@ class LSTMMKLDNNHandler
memset
(
user_c0_memory
.
get_data_handle
(),
0
,
memset
(
user_c0_memory
.
get_data_handle
(),
0
,
sizeof
(
float
)
*
this
->
N
*
this
->
OC
);
sizeof
(
float
)
*
this
->
N
*
this
->
OC
);
}
}
memory_p
=
std
::
make_shared
<
dnnl
::
memory
>
(
this
->
fwd_pd_
->
src_iter_desc
(),
memory_p
=
std
::
make_shared
<
dnnl
::
memory
>
(
this
->
engine_
);
this
->
fwd_pd_
->
src_iter_c_desc
(),
this
->
engine_
);
auto
&
astream
=
paddle
::
platform
::
MKLDNNDeviceContext
::
tls
().
get_stream
();
auto
&
astream
=
paddle
::
platform
::
MKLDNNDeviceContext
::
tls
().
get_stream
();
dnnl
::
reorder
(
user_c0_memory
,
*
memory_p
,
this
->
attr_
)
dnnl
::
reorder
(
user_c0_memory
,
*
memory_p
,
this
->
attr_
)
...
@@ -275,7 +279,15 @@ template <typename T>
...
@@ -275,7 +279,15 @@ template <typename T>
class
FusionLSTMMKLDNNKernel
:
public
framework
::
OpKernel
<
T
>
{
class
FusionLSTMMKLDNNKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
bool
is_bf16
=
std
::
is_same
<
T
,
paddle
::
platform
::
bfloat16
>::
value
;
const
bool
force_fp32_output
=
ctx
.
Attr
<
bool
>
(
"force_fp32_output"
);
// BF16 does not support force output
if
(
!
is_bf16
&&
force_fp32_output
)
{
RunKernel
<
float
>
(
ctx
);
RunKernel
<
float
>
(
ctx
);
}
else
{
RunKernel
<
T
>
(
ctx
);
}
}
}
template
<
typename
Tout
=
T
>
template
<
typename
Tout
=
T
>
...
@@ -327,10 +339,29 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
...
@@ -327,10 +339,29 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
auto
input_memory_p
=
auto
input_memory_p
=
handler
.
AcquireInputMemoryWithReorder
(
input
,
is_reverse
);
handler
.
AcquireInputMemoryWithReorder
(
input
,
is_reverse
);
auto
h0_memory_p
=
handler
.
AcquireH0Memory
(
h0
);
auto
c0_memory_p
=
handler
.
AcquireC0Memory
(
c0
);
auto
c0_memory_p
=
handler
.
AcquireC0Memory
(
c0
);
auto
weight_x_memory_p
=
handler
.
AcquireWeightXMemory
(
weight_x
);
auto
weight_h_memory_p
=
handler
.
AcquireWeightHMemory
(
weight_h
);
std
::
shared_ptr
<
dnnl
::
memory
>
h0_memory_p
,
weight_h_memory_p
,
weight_x_memory_p
;
if
(
weight_h
->
type
()
==
paddle
::
framework
::
proto
::
VarType_Type_FP32
)
{
h0_memory_p
=
handler
.
template
AcquireH0Memory
<
float
>(
h0
);
weight_x_memory_p
=
handler
.
template
AcquireWeightXMemory
<
float
>(
weight_x
);
weight_h_memory_p
=
handler
.
template
AcquireWeightHMemory
<
float
>(
weight_h
);
}
else
if
(
weight_h
->
type
()
==
paddle
::
framework
::
proto
::
VarType_Type_BF16
)
{
h0_memory_p
=
handler
.
template
AcquireH0Memory
<
paddle
::
platform
::
bfloat16
>(
h0
);
weight_x_memory_p
=
handler
.
template
AcquireWeightXMemory
<
paddle
::
platform
::
bfloat16
>(
weight_x
);
weight_h_memory_p
=
handler
.
template
AcquireWeightHMemory
<
paddle
::
platform
::
bfloat16
>(
weight_h
);
}
auto
bias_memory_p
=
handler
.
AcquireBiasMemory
(
bias
);
auto
bias_memory_p
=
handler
.
AcquireBiasMemory
(
bias
);
auto
hidden_onednn_memory_p
=
handler
.
AcquireOutputMemory
();
auto
hidden_onednn_memory_p
=
handler
.
AcquireOutputMemory
();
...
@@ -374,4 +405,5 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
...
@@ -374,4 +405,5 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_KERNEL
(
fusion_lstm
,
MKLDNN
,
paddle
::
platform
::
CPUPlace
,
REGISTER_OP_KERNEL
(
fusion_lstm
,
MKLDNN
,
paddle
::
platform
::
CPUPlace
,
ops
::
FusionLSTMMKLDNNKernel
<
float
>
);
ops
::
FusionLSTMMKLDNNKernel
<
float
>
,
ops
::
FusionLSTMMKLDNNKernel
<
paddle
::
platform
::
bfloat16
>
);
paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h
浏览文件 @
5b4f8aac
...
@@ -179,6 +179,7 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
...
@@ -179,6 +179,7 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
// TODO(grygielski) H0 is for now persistable
// TODO(grygielski) H0 is for now persistable
// TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
// TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
// not support in yet)
// not support in yet)
template
<
typename
U
>
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireH0Memory
(
const
Tensor
*
h0
)
{
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireH0Memory
(
const
Tensor
*
h0
)
{
const
std
::
string
h0_key
=
memory_key_
+
"@h0"
;
const
std
::
string
h0_key
=
memory_key_
+
"@h0"
;
auto
memory_p
=
auto
memory_p
=
...
@@ -187,17 +188,14 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
...
@@ -187,17 +188,14 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
if
(
!
memory_p
)
{
if
(
!
memory_p
)
{
auto
user_h0_memory
=
dnnl
::
memory
();
auto
user_h0_memory
=
dnnl
::
memory
();
if
(
h0
)
{
if
(
h0
)
{
user_h0_memory
=
user_h0_memory
=
dnnl
::
memory
(
dnnl
::
memory
({{
1
,
1
,
N
,
OC
},
{{
1
,
1
,
N
,
OC
},
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldnc
},
MKLDNNGetDataType
<
float
>
(),
this
->
engine_
,
to_void_cast
(
h0
->
data
<
U
>
()));
MKLDNNMemoryFormat
::
ldnc
},
this
->
engine_
,
to_void_cast
(
h0
->
data
<
float
>
()));
}
else
{
}
else
{
user_h0_memory
=
dnnl
::
memory
({{
1
,
1
,
N
,
OC
},
user_h0_memory
=
dnnl
::
memory
(
MKLDNNGetDataType
<
float
>
(),
{{
1
,
1
,
N
,
OC
},
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldnc
},
MKLDNNMemoryFormat
::
ldnc
},
this
->
engine_
);
this
->
engine_
);
memset
(
user_h0_memory
.
get_data_handle
(),
0
,
sizeof
(
float
)
*
N
*
OC
);
memset
(
user_h0_memory
.
get_data_handle
(),
0
,
sizeof
(
U
)
*
N
*
OC
);
}
}
memory_p
=
std
::
make_shared
<
dnnl
::
memory
>
(
this
->
fwd_pd_
->
src_iter_desc
(),
memory_p
=
std
::
make_shared
<
dnnl
::
memory
>
(
this
->
fwd_pd_
->
src_iter_desc
(),
this
->
engine_
);
this
->
engine_
);
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
浏览文件 @
5b4f8aac
...
@@ -30,6 +30,11 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
...
@@ -30,6 +30,11 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
def
set_confs
(
self
):
def
set_confs
(
self
):
self
.
mkldnn_data_type
=
False
self
.
mkldnn_data_type
=
False
def
test_check_output
(
self
):
for
use_seq
in
{
True
,
False
}:
self
.
attrs
[
'use_seq'
]
=
use_seq
self
.
check_output
(
check_dygraph
=
False
)
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"fusion_gru"
self
.
op_type
=
"fusion_gru"
self
.
lod
=
[[
2
,
4
,
3
]]
self
.
lod
=
[[
2
,
4
,
3
]]
...
@@ -45,6 +50,7 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
...
@@ -45,6 +50,7 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
self
.
origin_mode
=
False
self
.
origin_mode
=
False
self
.
use_mkldnn
=
True
self
.
use_mkldnn
=
True
self
.
force_fp32_output
=
False
self
.
force_fp32_output
=
False
self
.
weights_dtype
=
'fp32'
self
.
set_confs
()
self
.
set_confs
()
T
=
sum
(
self
.
lod
[
0
])
T
=
sum
(
self
.
lod
[
0
])
...
@@ -58,6 +64,9 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
...
@@ -58,6 +64,9 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
wx_fp32
=
np
.
random
.
rand
(
self
.
M
,
3
*
self
.
D
).
astype
(
'float32'
)
wx_fp32
=
np
.
random
.
rand
(
self
.
M
,
3
*
self
.
D
).
astype
(
'float32'
)
wh_fp32
=
np
.
random
.
rand
(
self
.
D
,
3
*
self
.
D
).
astype
(
'float32'
)
wh_fp32
=
np
.
random
.
rand
(
self
.
D
,
3
*
self
.
D
).
astype
(
'float32'
)
wx_bf16
=
convert_float_to_uint16
(
wx_fp32
)
wh_bf16
=
convert_float_to_uint16
(
wh_fp32
)
# bias is fp32 despite other inputs being in bf16
# bias is fp32 despite other inputs being in bf16
bias
=
np
.
random
.
rand
(
bias
=
np
.
random
.
rand
(
1
,
3
*
self
.
D
).
astype
(
'float32'
)
if
self
.
with_bias
else
np
.
zeros
(
1
,
3
*
self
.
D
).
astype
(
'float32'
)
if
self
.
with_bias
else
np
.
zeros
(
...
@@ -74,6 +83,13 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
...
@@ -74,6 +83,13 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
hidden_bf16
=
convert_float_to_uint16
(
hidden
)
hidden_bf16
=
convert_float_to_uint16
(
hidden
)
if
self
.
weights_dtype
==
'bf16'
:
self
.
inputs
=
{
'X'
:
(
x_bf16
,
self
.
lod
),
'WeightX'
:
wx_bf16
,
'WeightH'
:
wh_bf16
}
elif
self
.
weights_dtype
==
'fp32'
:
self
.
inputs
=
{
self
.
inputs
=
{
'X'
:
(
x_bf16
,
self
.
lod
),
'X'
:
(
x_bf16
,
self
.
lod
),
'WeightX'
:
wx_fp32
,
'WeightX'
:
wx_fp32
,
...
@@ -84,10 +100,13 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
...
@@ -84,10 +100,13 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
self
.
inputs
[
'Bias'
]
=
bias
self
.
inputs
[
'Bias'
]
=
bias
if
self
.
with_h0
:
if
self
.
with_h0
:
if
self
.
weights_dtype
==
'bf16'
:
self
.
inputs
[
'H0'
]
=
h0_bf16
self
.
inputs
[
'H0'
]
=
h0_bf16
elif
self
.
weights_dtype
==
'fp32'
:
self
.
inputs
[
'H0'
]
=
h0_fp32
h0_bf16
=
convert_float_to_uint16
(
h0_fp32
)
h0_bf16
=
convert_float_to_uint16
(
h0_fp32
)
self
.
outputs
=
{
'Hidden'
:
(
hidden
_bf16
,
self
.
lod
)}
self
.
outputs
=
{
'Hidden'
:
(
hidden
,
self
.
lod
)}
self
.
attrs
=
{
self
.
attrs
=
{
'activation'
:
self
.
act_state
,
'activation'
:
self
.
act_state
,
...
@@ -109,6 +128,11 @@ class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUBF16MKLDNNOp):
...
@@ -109,6 +128,11 @@ class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUBF16MKLDNNOp):
self
.
with_bias
=
False
self
.
with_bias
=
False
class
TestFusionGRUINT8MKLDNNBF16WeightsOp
(
TestFusionGRUBF16MKLDNNOp
):
def
set_confs
(
self
):
self
.
weights_dtype
=
'bf16'
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
from
paddle
import
enable_static
from
paddle
import
enable_static
enable_static
()
enable_static
()
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
浏览文件 @
5b4f8aac
...
@@ -146,4 +146,6 @@ class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp):
...
@@ -146,4 +146,6 @@ class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp):
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
from
paddle
import
enable_static
enable_static
()
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
0 → 100644
浏览文件 @
5b4f8aac
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
import
struct
import
paddle.fluid.core
as
core
from
paddle.fluid.tests.unittests.op_test
import
OpTest
,
convert_float_to_uint16
,
convert_uint16_to_float
from
paddle.fluid.tests.unittests.test_fusion_lstm_op
import
TestFusionLSTMOp
,
fc
,
ACTIVATION
,
fusion_lstm
from
paddle.fluid.tests.unittests.test_fusion_gru_op
import
fusion_gru
@
unittest
.
skipIf
(
not
core
.
supports_bfloat16
(),
"place does not support BF16 evaluation"
)
class
TestFusionLSTMBF16ONEDNNOp
(
OpTest
):
def
set_confs
(
self
):
self
.
mkldnn_data_type
=
False
def
test_check_output
(
self
):
for
use_seq
in
{
True
,
False
}:
self
.
attrs
[
'use_seq'
]
=
use_seq
self
.
check_output
(
check_dygraph
=
False
,
no_check_set
=
[
"Cell"
])
def
setUp
(
self
):
self
.
op_type
=
'fusion_lstm'
self
.
lod
=
[[
2
,
3
,
5
,
4
]]
self
.
M
=
8
self
.
D
=
16
self
.
has_initial_state
=
False
self
.
use_peepholes
=
False
self
.
is_reverse
=
False
self
.
_cpu_only
=
True
self
.
act_gate
=
'sigmoid'
self
.
act_cell
=
'tanh'
self
.
act_cand
=
'tanh'
self
.
use_mkldnn
=
True
self
.
force_fp32_output
=
False
self
.
weights_dtype
=
'fp32'
self
.
set_confs
()
T
=
sum
(
self
.
lod
[
0
])
bs
=
len
(
self
.
lod
[
0
])
# fp32 X input for reference implementation and
# corressponding bf16 data as input to LSTM oneDNN bf16 kernel
x
=
np
.
random
.
normal
(
size
=
(
T
,
self
.
M
)).
astype
(
'float32'
)
x_bf16
=
convert_float_to_uint16
(
x
)
if
self
.
has_initial_state
:
h0
=
np
.
random
.
normal
(
size
=
(
bs
,
self
.
D
)).
astype
(
'float32'
)
c0
=
np
.
random
.
normal
(
size
=
(
bs
,
self
.
D
)).
astype
(
'float32'
)
else
:
h0
=
np
.
zeros
((
bs
,
self
.
D
)).
astype
(
'float32'
)
c0
=
np
.
zeros
((
bs
,
self
.
D
)).
astype
(
'float32'
)
wh
=
np
.
random
.
normal
(
size
=
(
self
.
D
,
4
*
self
.
D
)).
astype
(
'float32'
)
h0_bf16
=
convert_float_to_uint16
(
h0
)
if
self
.
use_peepholes
:
b
=
np
.
random
.
normal
(
size
=
(
1
,
7
*
self
.
D
)).
astype
(
'float32'
)
else
:
b
=
np
.
random
.
normal
(
size
=
(
1
,
4
*
self
.
D
)).
astype
(
'float32'
)
w_b
=
np
.
copy
(
b
[:,
0
:
4
*
self
.
D
])
w_c
=
b
[:,
4
*
self
.
D
:]
if
self
.
use_peepholes
else
None
wx
=
np
.
random
.
normal
(
size
=
(
self
.
M
,
4
*
self
.
D
)).
astype
(
'float32'
)
wx_bf16
=
convert_float_to_uint16
(
wx
)
wh_bf16
=
convert_float_to_uint16
(
wh
)
bx
=
np
.
random
.
normal
(
size
=
(
1
,
4
*
self
.
D
)).
astype
(
'float32'
)
b
[
0
,
0
:
4
*
self
.
D
]
+=
bx
[
0
,
:]
hidden
,
c
=
fusion_lstm
(
x
,
self
.
lod
,
wx
,
bx
,
h0
,
c0
,
wh
,
w_b
,
w_c
,
self
.
is_reverse
,
ACTIVATION
[
self
.
act_gate
],
ACTIVATION
[
self
.
act_cell
],
ACTIVATION
[
self
.
act_cand
])
hidden
=
hidden
.
astype
(
'float32'
)
hidden_bf16
=
convert_float_to_uint16
(
hidden
)
if
self
.
weights_dtype
==
'bf16'
:
self
.
inputs
=
{
'X'
:
(
x_bf16
,
self
.
lod
),
'WeightX'
:
wx_bf16
,
'WeightH'
:
wh_bf16
,
'Bias'
:
b
}
elif
self
.
weights_dtype
==
'fp32'
:
self
.
inputs
=
{
'X'
:
(
x_bf16
,
self
.
lod
),
'WeightX'
:
wx
,
'WeightH'
:
wh
,
'Bias'
:
b
}
if
self
.
has_initial_state
:
if
self
.
weights_dtype
==
'bf16'
:
self
.
inputs
[
'H0'
]
=
h0_bf16
elif
self
.
weights_dtype
==
'fp32'
:
self
.
inputs
[
'H0'
]
=
h0
self
.
inputs
[
'C0'
]
=
c0
self
.
outputs
=
{
'Hidden'
:
(
hidden
,
self
.
lod
),
'Cell'
:
(
c
,
self
.
lod
),
}
self
.
attrs
=
{
'use_peepholes'
:
self
.
use_peepholes
,
'is_reverse'
:
self
.
is_reverse
,
'gate_activation'
:
self
.
act_gate
,
'cell_activation'
:
self
.
act_cell
,
'candidate_activation'
:
self
.
act_cand
,
'force_fp32_output'
:
self
.
force_fp32_output
,
'use_mkldnn'
:
self
.
use_mkldnn
}
class
TestFusionLSTMBF16ONEDNNPeepholesOp
(
TestFusionLSTMBF16ONEDNNOp
):
def
set_confs
(
self
):
self
.
use_peepholes
=
True
class
TestFusionLSTMBF16ONEDNNInitializedStateOp
(
TestFusionLSTMBF16ONEDNNOp
):
def
set_confs
(
self
):
self
.
has_initial_state
=
True
class
TestFusionLSTMBF16ONEDNNReverseOp
(
TestFusionLSTMBF16ONEDNNOp
):
def
set_confs
(
self
):
self
.
is_reverse
=
True
class
TestFusionLSTMBF16ONEDNNBF16WeightsOp
(
TestFusionLSTMBF16ONEDNNOp
):
def
set_confs
(
self
):
self
.
weights_dtype
=
'bf16'
if
__name__
==
"__main__"
:
from
paddle
import
enable_static
enable_static
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
5b4f8aac
...
@@ -235,6 +235,19 @@ def convert_float_to_uint16(float_list, data_format="NCHW"):
...
@@ -235,6 +235,19 @@ def convert_float_to_uint16(float_list, data_format="NCHW"):
return
new_output
return
new_output
def
copy_bits_from_uint16_to_float
(
i
):
i
=
np
.
uint32
(
i
)
<<
16
return
struct
.
unpack
(
'<f'
,
struct
.
pack
(
'<I'
,
i
))[
0
]
def
convert_uint16_to_float
(
uint16_list
):
new_output
=
[]
for
x
in
np
.
nditer
(
uint16_list
):
new_output
.
append
(
np
.
float32
(
copy_bits_from_uint16_to_float
(
x
)))
return
np
.
reshape
(
new_output
,
uint16_list
.
shape
).
view
(
np
.
float32
)
class
OpTest
(
unittest
.
TestCase
):
class
OpTest
(
unittest
.
TestCase
):
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
...
@@ -1143,8 +1156,14 @@ class OpTest(unittest.TestCase):
...
@@ -1143,8 +1156,14 @@ class OpTest(unittest.TestCase):
idx
=
find_actual
(
out_name
,
fetch_list
)
idx
=
find_actual
(
out_name
,
fetch_list
)
actual
=
outs
[
idx
]
actual
=
outs
[
idx
]
actual_t
=
np
.
array
(
actual
)
actual_t
=
np
.
array
(
actual
)
expect
=
self
.
outputs
[
out_name
]
expect
=
self
.
outputs
[
out_name
]
expect_t
=
expect
[
0
]
if
isinstance
(
expect
,
tuple
)
else
expect
expect_t
=
expect
[
0
]
if
isinstance
(
expect
,
tuple
)
else
expect
if
actual_t
.
dtype
==
np
.
uint16
and
expect_t
.
dtype
==
np
.
float32
:
actual_t
=
convert_uint16_to_float
(
actual_t
)
atol
=
0.03
self
.
assertTrue
(
self
.
assertTrue
(
np
.
allclose
(
np
.
allclose
(
actual_t
,
expect_t
,
atol
=
atol
,
equal_nan
=
equal_nan
),
actual_t
,
expect_t
,
atol
=
atol
,
equal_nan
=
equal_nan
),
...
...
tools/static_mode_white_list.py
浏览文件 @
5b4f8aac
...
@@ -602,8 +602,10 @@ STATIC_MODE_TESTING_LIST = [
...
@@ -602,8 +602,10 @@ STATIC_MODE_TESTING_LIST = [
'test_nearest_interp_mkldnn_op'
,
'test_nearest_interp_mkldnn_op'
,
'test_bilinear_interp_mkldnn_op'
,
'test_bilinear_interp_mkldnn_op'
,
'test_fusion_gru_int8_mkldnn_op'
,
'test_fusion_gru_int8_mkldnn_op'
,
'test_fusion_gru_bf16_mkldnn_op'
,
'test_fusion_gru_mkldnn_op'
,
'test_fusion_gru_mkldnn_op'
,
'test_fusion_lstm_mkldnn_op'
,
'test_fusion_lstm_mkldnn_op'
,
'test_fusion_lstm_bf16_mkldnn_op'
,
'test_gaussian_random_mkldnn_op'
,
'test_gaussian_random_mkldnn_op'
,
'test_lrn_mkldnn_op'
,
'test_lrn_mkldnn_op'
,
'test_matmul_mkldnn_op'
,
'test_matmul_mkldnn_op'
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录