Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
5b4f8aac
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
5b4f8aac
编写于
3月 04, 2021
作者:
J
jakpiase
提交者:
GitHub
3月 04, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Added LSTM BF16 and fixed GRU BF16 (#31234)
上级
7cdf6ea7
变更
9
隐藏空白更改
内联
并排
Showing
9 changed files
with
322 additions
and
58 deletions
+322
-58
paddle/fluid/operators/fused/fusion_lstm_op.cc
paddle/fluid/operators/fused/fusion_lstm_op.cc
+4
-0
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+44
-20
paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
+53
-21
paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h
paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h
+8
-10
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
.../tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
+31
-7
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
.../tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
+2
-0
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
...tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
+159
-0
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+19
-0
tools/static_mode_white_list.py
tools/static_mode_white_list.py
+2
-0
未找到文件。
paddle/fluid/operators/fused/fusion_lstm_op.cc
浏览文件 @
5b4f8aac
...
@@ -249,6 +249,10 @@ void FusionLSTMOpMaker::Make() {
...
@@ -249,6 +249,10 @@ void FusionLSTMOpMaker::Make() {
AddAttr
<
bool
>
(
"use_mkldnn"
,
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false) Only used in mkldnn kernel"
)
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"force_fp32_output"
,
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8"
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
Fusion Long-Short Term Memory (LSTM) Operator.
Fusion Long-Short Term Memory (LSTM) Operator.
This operator fuse the X into LSTM, more details can refer to LSTM op.
This operator fuse the X into LSTM, more details can refer to LSTM op.
...
...
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
浏览文件 @
5b4f8aac
...
@@ -89,6 +89,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
...
@@ -89,6 +89,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
}
}
}
}
template
<
typename
U
>
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightXMemory
(
const
Tensor
*
weight_x
,
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightXMemory
(
const
Tensor
*
weight_x
,
const
bool
origin_mode
)
{
const
bool
origin_mode
)
{
const
std
::
string
wx_key
=
this
->
memory_key_
+
"@weight_x"
;
const
std
::
string
wx_key
=
this
->
memory_key_
+
"@weight_x"
;
...
@@ -98,18 +99,18 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
...
@@ -98,18 +99,18 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
if
(
!
memory_p
)
{
if
(
!
memory_p
)
{
auto
user_md
=
auto
user_md
=
MKLDNNMemDesc
({
1
,
1
,
this
->
IC
,
this
->
G
,
this
->
OC
},
MKLDNNMemDesc
({
1
,
1
,
this
->
IC
,
this
->
G
,
this
->
OC
},
MKLDNNGetDataType
<
float
>
(),
MKLDNNMemoryFormat
::
ldigo
);
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldigo
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
*
weight_x_data
=
auto
*
weight_x_data
=
reinterpret_cast
<
U
*>
(
user_memory
.
get_data_handle
());
reinterpret_cast
<
float
*>
(
user_memory
.
get_data_handle
());
memcpy
(
weight_x_data
,
weight_x
->
data
<
U
>
(),
memcpy
(
weight_x_data
,
weight_x
->
data
<
float
>
(),
sizeof
(
U
)
*
this
->
IC
*
this
->
G
*
this
->
OC
);
sizeof
(
float
)
*
this
->
IC
*
this
->
G
*
this
->
OC
);
if
(
origin_mode
==
false
)
{
if
(
origin_mode
==
false
)
{
for
(
int64_t
i
=
0
;
i
<
this
->
IC
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
this
->
IC
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
this
->
OC
;
++
j
)
{
for
(
int64_t
j
=
0
;
j
<
this
->
OC
;
++
j
)
{
weight_x_data
[
j
]
*=
-
1
;
U
minus_one
(
-
1.0
f
);
weight_x_data
[
j
]
=
minus_one
*
weight_x_data
[
j
];
}
}
weight_x_data
+=
3
*
this
->
OC
;
weight_x_data
+=
3
*
this
->
OC
;
}
}
...
@@ -127,6 +128,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
...
@@ -127,6 +128,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
return
memory_p
;
return
memory_p
;
}
}
template
<
typename
U
>
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightHMemory
(
const
Tensor
*
weight_h
,
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightHMemory
(
const
Tensor
*
weight_h
,
const
bool
origin_mode
)
{
const
bool
origin_mode
)
{
const
std
::
string
wh_key
=
this
->
memory_key_
+
"@weight_h"
;
const
std
::
string
wh_key
=
this
->
memory_key_
+
"@weight_h"
;
...
@@ -136,34 +138,33 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
...
@@ -136,34 +138,33 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
if
(
!
memory_p
)
{
if
(
!
memory_p
)
{
auto
user_md
=
auto
user_md
=
MKLDNNMemDesc
({
1
,
1
,
this
->
OC
,
this
->
G
,
this
->
OC
},
MKLDNNMemDesc
({
1
,
1
,
this
->
OC
,
this
->
G
,
this
->
OC
},
MKLDNNGetDataType
<
float
>
(),
MKLDNNMemoryFormat
::
ldigo
);
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldigo
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
// Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
// Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
// oneDNN format [OC, 3OC]
// oneDNN format [OC, 3OC]
auto
*
weight_h_data
=
auto
*
weight_h_data
=
reinterpret_cast
<
U
*>
(
user_memory
.
get_data_handle
());
reinterpret_cast
<
float
*>
(
user_memory
.
get_data_handle
());
auto
*
user_weight_h_data
=
weight_h
->
data
<
U
>
();
auto
*
user_weight_h_data
=
weight_h
->
data
<
float
>
();
auto
src1_iter
=
user_weight_h_data
;
auto
src1_iter
=
user_weight_h_data
;
auto
src2_iter
=
user_weight_h_data
+
2
*
this
->
OC
*
this
->
OC
;
auto
src2_iter
=
user_weight_h_data
+
2
*
this
->
OC
*
this
->
OC
;
for
(
int64_t
c
=
0
;
c
<
this
->
OC
;
++
c
)
{
for
(
int64_t
c
=
0
;
c
<
this
->
OC
;
++
c
)
{
memcpy
(
weight_h_data
,
src1_iter
,
2
*
this
->
OC
*
sizeof
(
float
));
memcpy
(
weight_h_data
,
src1_iter
,
2
*
this
->
OC
*
sizeof
(
U
));
memcpy
(
weight_h_data
+
2
*
this
->
OC
,
src2_iter
,
memcpy
(
weight_h_data
+
2
*
this
->
OC
,
src2_iter
,
this
->
OC
*
sizeof
(
U
));
this
->
OC
*
sizeof
(
float
));
src1_iter
+=
2
*
this
->
OC
;
src1_iter
+=
2
*
this
->
OC
;
src2_iter
+=
this
->
OC
;
src2_iter
+=
this
->
OC
;
weight_h_data
+=
3
*
this
->
OC
;
weight_h_data
+=
3
*
this
->
OC
;
}
}
weight_h_data
=
reinterpret_cast
<
float
*>
(
user_memory
.
get_data_handle
());
weight_h_data
=
reinterpret_cast
<
U
*>
(
user_memory
.
get_data_handle
());
if
(
origin_mode
==
false
)
{
if
(
origin_mode
==
false
)
{
for
(
int64_t
i
=
0
;
i
<
this
->
OC
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
this
->
OC
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
this
->
OC
;
++
j
)
{
for
(
int64_t
j
=
0
;
j
<
this
->
OC
;
++
j
)
{
weight_h_data
[
j
]
*=
-
1
;
U
minus_one
(
-
1.0
f
);
weight_h_data
[
j
]
=
minus_one
*
weight_h_data
[
j
];
}
}
weight_h_data
+=
3
*
this
->
OC
;
weight_h_data
+=
3
*
this
->
OC
;
}
}
...
@@ -273,11 +274,34 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
...
@@ -273,11 +274,34 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
auto
input_memory_p
=
auto
input_memory_p
=
handler
.
AcquireInputMemoryWithReorder
(
input
,
is_reverse
);
handler
.
AcquireInputMemoryWithReorder
(
input
,
is_reverse
);
auto
h0_memory_p
=
handler
.
AcquireH0Memory
(
h0
);
auto
weight_x_memory_p
=
std
::
shared_ptr
<
dnnl
::
memory
>
h0_memory_p
,
weight_h_memory_p
,
handler
.
AcquireWeightXMemory
(
weight_x
,
origin_mode
);
weight_x_memory_p
;
auto
weight_h_memory_p
=
handler
.
AcquireWeightHMemory
(
weight_h
,
origin_mode
);
if
(
weight_h
->
type
()
==
paddle
::
framework
::
proto
::
VarType_Type_FP32
)
{
h0_memory_p
=
handler
.
template
AcquireH0Memory
<
float
>(
h0
);
weight_x_memory_p
=
handler
.
template
AcquireWeightXMemory
<
float
>(
weight_x
,
origin_mode
);
weight_h_memory_p
=
handler
.
template
AcquireWeightHMemory
<
float
>(
weight_h
,
origin_mode
);
}
else
if
(
weight_h
->
type
()
==
paddle
::
framework
::
proto
::
VarType_Type_BF16
)
{
h0_memory_p
=
handler
.
template
AcquireH0Memory
<
paddle
::
platform
::
bfloat16
>(
h0
);
weight_x_memory_p
=
handler
.
template
AcquireWeightXMemory
<
paddle
::
platform
::
bfloat16
>(
weight_x
,
origin_mode
);
weight_h_memory_p
=
handler
.
template
AcquireWeightHMemory
<
paddle
::
platform
::
bfloat16
>(
weight_h
,
origin_mode
);
}
else
{
h0_memory_p
=
handler
.
template
AcquireH0Memory
<
uint8_t
>(
h0
);
weight_x_memory_p
=
handler
.
template
AcquireWeightXMemory
<
int8_t
>(
weight_x
,
origin_mode
);
weight_h_memory_p
=
handler
.
template
AcquireWeightHMemory
<
int8_t
>(
weight_h
,
origin_mode
);
}
auto
bias_memory_p
=
handler
.
AcquireBiasMemory
(
bias
,
origin_mode
);
auto
bias_memory_p
=
handler
.
AcquireBiasMemory
(
bias
,
origin_mode
);
auto
hidden_onednn_memory_p
=
handler
.
AcquireOutputMemory
();
auto
hidden_onednn_memory_p
=
handler
.
AcquireOutputMemory
();
...
...
paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
浏览文件 @
5b4f8aac
...
@@ -81,8 +81,11 @@ class LSTMMKLDNNHandler
...
@@ -81,8 +81,11 @@ class LSTMMKLDNNHandler
MKLDNNMemoryFormat
::
tnc
);
MKLDNNMemoryFormat
::
tnc
);
auto
h0_md
=
MKLDNNMemDesc
({
L
,
D
,
N
,
OC
},
MKLDNNGetDataType
<
T
>
(),
auto
h0_md
=
MKLDNNMemDesc
({
L
,
D
,
N
,
OC
},
MKLDNNGetDataType
<
T
>
(),
MKLDNNMemoryFormat
::
ldnc
);
MKLDNNMemoryFormat
::
ldnc
);
auto
c0_md
=
MKLDNNMemDesc
({
L
,
D
,
N
,
OC
},
MKLDNNGetDataType
<
T
>
(),
auto
c0_md
=
MKLDNNMemDesc
(
MKLDNNMemoryFormat
::
ldnc
);
{
L
,
D
,
N
,
OC
},
MKLDNNGetDataType
<
float
>
(),
// Vanilla LSTM and LSTM
// with peepholes have c0 as
// fp32
MKLDNNMemoryFormat
::
ldnc
);
// Create LSTM oneDNN primitive
// Create LSTM oneDNN primitive
const
auto
direction
=
const
auto
direction
=
...
@@ -110,13 +113,14 @@ class LSTMMKLDNNHandler
...
@@ -110,13 +113,14 @@ class LSTMMKLDNNHandler
// needed
// needed
// PaddlePaddle: {c, i, f, o}
// PaddlePaddle: {c, i, f, o}
// oneDNN: {i, f, c, o}
// oneDNN: {i, f, c, o}
void
ReorderGates
(
float
*
weights
,
int64_t
I
)
{
template
<
typename
U
>
void
ReorderGates
(
U
*
weights
,
int64_t
I
)
{
size_t
inner_block_size
=
this
->
OC
;
size_t
inner_block_size
=
this
->
OC
;
size_t
block_size
=
inner_block_size
*
this
->
G
;
size_t
block_size
=
inner_block_size
*
this
->
G
;
for
(
size_t
i
=
0
;
i
<
(
size_t
)
I
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
(
size_t
)
I
;
++
i
)
{
size_t
offset
=
i
*
block_size
;
size_t
offset
=
i
*
block_size
;
float
*
base_pos
=
weights
+
offset
;
U
*
base_pos
=
weights
+
offset
;
std
::
swap_ranges
(
base_pos
,
base_pos
+
inner_block_size
,
std
::
swap_ranges
(
base_pos
,
base_pos
+
inner_block_size
,
base_pos
+
inner_block_size
);
// c <-> i
base_pos
+
inner_block_size
);
// c <-> i
std
::
swap_ranges
(
base_pos
+
inner_block_size
,
std
::
swap_ranges
(
base_pos
+
inner_block_size
,
...
@@ -125,6 +129,7 @@ class LSTMMKLDNNHandler
...
@@ -125,6 +129,7 @@ class LSTMMKLDNNHandler
}
}
}
}
template
<
typename
U
>
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightXMemory
(
const
Tensor
*
weight_x
)
{
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightXMemory
(
const
Tensor
*
weight_x
)
{
const
std
::
string
wx_key
=
this
->
memory_key_
+
"@weight_x"
;
const
std
::
string
wx_key
=
this
->
memory_key_
+
"@weight_x"
;
auto
memory_p
=
auto
memory_p
=
...
@@ -133,13 +138,12 @@ class LSTMMKLDNNHandler
...
@@ -133,13 +138,12 @@ class LSTMMKLDNNHandler
if
(
!
memory_p
)
{
if
(
!
memory_p
)
{
auto
user_md
=
auto
user_md
=
MKLDNNMemDesc
({
1
,
1
,
this
->
IC
,
this
->
G
,
this
->
OC
},
MKLDNNMemDesc
({
1
,
1
,
this
->
IC
,
this
->
G
,
this
->
OC
},
MKLDNNGetDataType
<
float
>
(),
MKLDNNMemoryFormat
::
ldigo
);
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldigo
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
*
weight_x_data
=
auto
*
weight_x_data
=
reinterpret_cast
<
U
*>
(
user_memory
.
get_data_handle
());
reinterpret_cast
<
float
*>
(
user_memory
.
get_data_handle
());
memcpy
(
weight_x_data
,
weight_x
->
data
<
U
>
(),
memcpy
(
weight_x_data
,
weight_x
->
data
<
float
>
(),
sizeof
(
U
)
*
this
->
IC
*
this
->
G
*
this
->
OC
);
sizeof
(
float
)
*
this
->
IC
*
this
->
G
*
this
->
OC
);
ReorderGates
(
weight_x_data
,
this
->
IC
);
ReorderGates
(
weight_x_data
,
this
->
IC
);
...
@@ -155,6 +159,7 @@ class LSTMMKLDNNHandler
...
@@ -155,6 +159,7 @@ class LSTMMKLDNNHandler
return
memory_p
;
return
memory_p
;
}
}
template
<
typename
U
>
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightHMemory
(
const
Tensor
*
weight_h
)
{
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireWeightHMemory
(
const
Tensor
*
weight_h
)
{
const
std
::
string
wh_key
=
this
->
memory_key_
+
"@weight_h"
;
const
std
::
string
wh_key
=
this
->
memory_key_
+
"@weight_h"
;
auto
memory_p
=
auto
memory_p
=
...
@@ -163,13 +168,12 @@ class LSTMMKLDNNHandler
...
@@ -163,13 +168,12 @@ class LSTMMKLDNNHandler
if
(
!
memory_p
)
{
if
(
!
memory_p
)
{
auto
user_md
=
auto
user_md
=
MKLDNNMemDesc
({
1
,
1
,
this
->
OC
,
this
->
G
,
this
->
OC
},
MKLDNNMemDesc
({
1
,
1
,
this
->
OC
,
this
->
G
,
this
->
OC
},
MKLDNNGetDataType
<
float
>
(),
MKLDNNMemoryFormat
::
ldigo
);
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldigo
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
user_memory
=
dnnl
::
memory
(
user_md
,
this
->
engine_
);
auto
*
weight_h_data
=
auto
*
weight_h_data
=
reinterpret_cast
<
U
*>
(
user_memory
.
get_data_handle
());
reinterpret_cast
<
float
*>
(
user_memory
.
get_data_handle
());
memcpy
(
weight_h_data
,
weight_h
->
data
<
U
>
(),
memcpy
(
weight_h_data
,
weight_h
->
data
<
float
>
(),
sizeof
(
U
)
*
this
->
OC
*
this
->
G
*
this
->
OC
);
sizeof
(
float
)
*
this
->
OC
*
this
->
G
*
this
->
OC
);
ReorderGates
(
weight_h_data
,
this
->
OC
);
ReorderGates
(
weight_h_data
,
this
->
OC
);
...
@@ -258,8 +262,8 @@ class LSTMMKLDNNHandler
...
@@ -258,8 +262,8 @@ class LSTMMKLDNNHandler
memset
(
user_c0_memory
.
get_data_handle
(),
0
,
memset
(
user_c0_memory
.
get_data_handle
(),
0
,
sizeof
(
float
)
*
this
->
N
*
this
->
OC
);
sizeof
(
float
)
*
this
->
N
*
this
->
OC
);
}
}
memory_p
=
std
::
make_shared
<
dnnl
::
memory
>
(
this
->
fwd_pd_
->
src_iter_desc
(),
memory_p
=
std
::
make_shared
<
dnnl
::
memory
>
(
this
->
engine_
);
this
->
fwd_pd_
->
src_iter_c_desc
(),
this
->
engine_
);
auto
&
astream
=
paddle
::
platform
::
MKLDNNDeviceContext
::
tls
().
get_stream
();
auto
&
astream
=
paddle
::
platform
::
MKLDNNDeviceContext
::
tls
().
get_stream
();
dnnl
::
reorder
(
user_c0_memory
,
*
memory_p
,
this
->
attr_
)
dnnl
::
reorder
(
user_c0_memory
,
*
memory_p
,
this
->
attr_
)
...
@@ -275,7 +279,15 @@ template <typename T>
...
@@ -275,7 +279,15 @@ template <typename T>
class
FusionLSTMMKLDNNKernel
:
public
framework
::
OpKernel
<
T
>
{
class
FusionLSTMMKLDNNKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
RunKernel
<
float
>
(
ctx
);
const
bool
is_bf16
=
std
::
is_same
<
T
,
paddle
::
platform
::
bfloat16
>::
value
;
const
bool
force_fp32_output
=
ctx
.
Attr
<
bool
>
(
"force_fp32_output"
);
// BF16 does not support force output
if
(
!
is_bf16
&&
force_fp32_output
)
{
RunKernel
<
float
>
(
ctx
);
}
else
{
RunKernel
<
T
>
(
ctx
);
}
}
}
template
<
typename
Tout
=
T
>
template
<
typename
Tout
=
T
>
...
@@ -327,10 +339,29 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
...
@@ -327,10 +339,29 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
auto
input_memory_p
=
auto
input_memory_p
=
handler
.
AcquireInputMemoryWithReorder
(
input
,
is_reverse
);
handler
.
AcquireInputMemoryWithReorder
(
input
,
is_reverse
);
auto
h0_memory_p
=
handler
.
AcquireH0Memory
(
h0
);
auto
c0_memory_p
=
handler
.
AcquireC0Memory
(
c0
);
auto
c0_memory_p
=
handler
.
AcquireC0Memory
(
c0
);
auto
weight_x_memory_p
=
handler
.
AcquireWeightXMemory
(
weight_x
);
auto
weight_h_memory_p
=
handler
.
AcquireWeightHMemory
(
weight_h
);
std
::
shared_ptr
<
dnnl
::
memory
>
h0_memory_p
,
weight_h_memory_p
,
weight_x_memory_p
;
if
(
weight_h
->
type
()
==
paddle
::
framework
::
proto
::
VarType_Type_FP32
)
{
h0_memory_p
=
handler
.
template
AcquireH0Memory
<
float
>(
h0
);
weight_x_memory_p
=
handler
.
template
AcquireWeightXMemory
<
float
>(
weight_x
);
weight_h_memory_p
=
handler
.
template
AcquireWeightHMemory
<
float
>(
weight_h
);
}
else
if
(
weight_h
->
type
()
==
paddle
::
framework
::
proto
::
VarType_Type_BF16
)
{
h0_memory_p
=
handler
.
template
AcquireH0Memory
<
paddle
::
platform
::
bfloat16
>(
h0
);
weight_x_memory_p
=
handler
.
template
AcquireWeightXMemory
<
paddle
::
platform
::
bfloat16
>(
weight_x
);
weight_h_memory_p
=
handler
.
template
AcquireWeightHMemory
<
paddle
::
platform
::
bfloat16
>(
weight_h
);
}
auto
bias_memory_p
=
handler
.
AcquireBiasMemory
(
bias
);
auto
bias_memory_p
=
handler
.
AcquireBiasMemory
(
bias
);
auto
hidden_onednn_memory_p
=
handler
.
AcquireOutputMemory
();
auto
hidden_onednn_memory_p
=
handler
.
AcquireOutputMemory
();
...
@@ -374,4 +405,5 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
...
@@ -374,4 +405,5 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_KERNEL
(
fusion_lstm
,
MKLDNN
,
paddle
::
platform
::
CPUPlace
,
REGISTER_OP_KERNEL
(
fusion_lstm
,
MKLDNN
,
paddle
::
platform
::
CPUPlace
,
ops
::
FusionLSTMMKLDNNKernel
<
float
>
);
ops
::
FusionLSTMMKLDNNKernel
<
float
>
,
ops
::
FusionLSTMMKLDNNKernel
<
paddle
::
platform
::
bfloat16
>
);
paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h
浏览文件 @
5b4f8aac
...
@@ -179,6 +179,7 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
...
@@ -179,6 +179,7 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
// TODO(grygielski) H0 is for now persistable
// TODO(grygielski) H0 is for now persistable
// TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
// TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
// not support it yet)
// not support it yet)
template
<
typename
U
>
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireH0Memory
(
const
Tensor
*
h0
)
{
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireH0Memory
(
const
Tensor
*
h0
)
{
const
std
::
string
h0_key
=
memory_key_
+
"@h0"
;
const
std
::
string
h0_key
=
memory_key_
+
"@h0"
;
auto
memory_p
=
auto
memory_p
=
...
@@ -187,17 +188,14 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
...
@@ -187,17 +188,14 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
if
(
!
memory_p
)
{
if
(
!
memory_p
)
{
auto
user_h0_memory
=
dnnl
::
memory
();
auto
user_h0_memory
=
dnnl
::
memory
();
if
(
h0
)
{
if
(
h0
)
{
user_h0_memory
=
user_h0_memory
=
dnnl
::
memory
(
dnnl
::
memory
({{
1
,
1
,
N
,
OC
},
{{
1
,
1
,
N
,
OC
},
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldnc
},
MKLDNNGetDataType
<
float
>
(),
this
->
engine_
,
to_void_cast
(
h0
->
data
<
U
>
()));
MKLDNNMemoryFormat
::
ldnc
},
this
->
engine_
,
to_void_cast
(
h0
->
data
<
float
>
()));
}
else
{
}
else
{
user_h0_memory
=
dnnl
::
memory
({{
1
,
1
,
N
,
OC
},
user_h0_memory
=
dnnl
::
memory
(
MKLDNNGetDataType
<
float
>
(),
{{
1
,
1
,
N
,
OC
},
MKLDNNGetDataType
<
U
>
(),
MKLDNNMemoryFormat
::
ldnc
},
MKLDNNMemoryFormat
::
ldnc
},
this
->
engine_
);
this
->
engine_
);
memset
(
user_h0_memory
.
get_data_handle
(),
0
,
sizeof
(
U
)
*
N
*
OC
);
memset
(
user_h0_memory
.
get_data_handle
(),
0
,
sizeof
(
float
)
*
N
*
OC
);
}
}
memory_p
=
std
::
make_shared
<
dnnl
::
memory
>
(
this
->
fwd_pd_
->
src_iter_desc
(),
memory_p
=
std
::
make_shared
<
dnnl
::
memory
>
(
this
->
fwd_pd_
->
src_iter_desc
(),
this
->
engine_
);
this
->
engine_
);
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
浏览文件 @
5b4f8aac
...
@@ -30,6 +30,11 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
...
@@ -30,6 +30,11 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
def
set_confs
(
self
):
def
set_confs
(
self
):
self
.
mkldnn_data_type
=
False
self
.
mkldnn_data_type
=
False
def
test_check_output
(
self
):
for
use_seq
in
{
True
,
False
}:
self
.
attrs
[
'use_seq'
]
=
use_seq
self
.
check_output
(
check_dygraph
=
False
)
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"fusion_gru"
self
.
op_type
=
"fusion_gru"
self
.
lod
=
[[
2
,
4
,
3
]]
self
.
lod
=
[[
2
,
4
,
3
]]
...
@@ -45,6 +50,7 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
...
@@ -45,6 +50,7 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
self
.
origin_mode
=
False
self
.
origin_mode
=
False
self
.
use_mkldnn
=
True
self
.
use_mkldnn
=
True
self
.
force_fp32_output
=
False
self
.
force_fp32_output
=
False
self
.
weights_dtype
=
'fp32'
self
.
set_confs
()
self
.
set_confs
()
T
=
sum
(
self
.
lod
[
0
])
T
=
sum
(
self
.
lod
[
0
])
...
@@ -58,6 +64,9 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
...
@@ -58,6 +64,9 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
wx_fp32
=
np
.
random
.
rand
(
self
.
M
,
3
*
self
.
D
).
astype
(
'float32'
)
wx_fp32
=
np
.
random
.
rand
(
self
.
M
,
3
*
self
.
D
).
astype
(
'float32'
)
wh_fp32
=
np
.
random
.
rand
(
self
.
D
,
3
*
self
.
D
).
astype
(
'float32'
)
wh_fp32
=
np
.
random
.
rand
(
self
.
D
,
3
*
self
.
D
).
astype
(
'float32'
)
wx_bf16
=
convert_float_to_uint16
(
wx_fp32
)
wh_bf16
=
convert_float_to_uint16
(
wh_fp32
)
# bias is fp32 despite other inputs being in bf16
# bias is fp32 despite other inputs being in bf16
bias
=
np
.
random
.
rand
(
bias
=
np
.
random
.
rand
(
1
,
3
*
self
.
D
).
astype
(
'float32'
)
if
self
.
with_bias
else
np
.
zeros
(
1
,
3
*
self
.
D
).
astype
(
'float32'
)
if
self
.
with_bias
else
np
.
zeros
(
...
@@ -74,20 +83,30 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
...
@@ -74,20 +83,30 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
hidden_bf16
=
convert_float_to_uint16
(
hidden
)
hidden_bf16
=
convert_float_to_uint16
(
hidden
)
self
.
inputs
=
{
if
self
.
weights_dtype
==
'bf16'
:
'X'
:
(
x_bf16
,
self
.
lod
),
self
.
inputs
=
{
'WeightX'
:
wx_fp32
,
'X'
:
(
x_bf16
,
self
.
lod
),
'WeightH'
:
wh_fp32
'WeightX'
:
wx_bf16
,
}
'WeightH'
:
wh_bf16
}
elif
self
.
weights_dtype
==
'fp32'
:
self
.
inputs
=
{
'X'
:
(
x_bf16
,
self
.
lod
),
'WeightX'
:
wx_fp32
,
'WeightH'
:
wh_fp32
}
if
self
.
with_bias
:
if
self
.
with_bias
:
self
.
inputs
[
'Bias'
]
=
bias
self
.
inputs
[
'Bias'
]
=
bias
if
self
.
with_h0
:
if
self
.
with_h0
:
self
.
inputs
[
'H0'
]
=
h0_bf16
if
self
.
weights_dtype
==
'bf16'
:
self
.
inputs
[
'H0'
]
=
h0_bf16
elif
self
.
weights_dtype
==
'fp32'
:
self
.
inputs
[
'H0'
]
=
h0_fp32
h0_bf16
=
convert_float_to_uint16
(
h0_fp32
)
h0_bf16
=
convert_float_to_uint16
(
h0_fp32
)
self
.
outputs
=
{
'Hidden'
:
(
hidden
_bf16
,
self
.
lod
)}
self
.
outputs
=
{
'Hidden'
:
(
hidden
,
self
.
lod
)}
self
.
attrs
=
{
self
.
attrs
=
{
'activation'
:
self
.
act_state
,
'activation'
:
self
.
act_state
,
...
@@ -109,6 +128,11 @@ class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUBF16MKLDNNOp):
...
@@ -109,6 +128,11 @@ class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUBF16MKLDNNOp):
self
.
with_bias
=
False
self
.
with_bias
=
False
class
TestFusionGRUINT8MKLDNNBF16WeightsOp
(
TestFusionGRUBF16MKLDNNOp
):
def
set_confs
(
self
):
self
.
weights_dtype
=
'bf16'
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
from
paddle
import
enable_static
from
paddle
import
enable_static
enable_static
()
enable_static
()
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
浏览文件 @
5b4f8aac
...
@@ -146,4 +146,6 @@ class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp):
...
@@ -146,4 +146,6 @@ class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp):
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
from
paddle
import
enable_static
enable_static
()
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
0 → 100644
浏览文件 @
5b4f8aac
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
import
struct
import
paddle.fluid.core
as
core
from
paddle.fluid.tests.unittests.op_test
import
OpTest
,
convert_float_to_uint16
,
convert_uint16_to_float
from
paddle.fluid.tests.unittests.test_fusion_lstm_op
import
TestFusionLSTMOp
,
fc
,
ACTIVATION
,
fusion_lstm
from
paddle.fluid.tests.unittests.test_fusion_gru_op
import
fusion_gru
@unittest.skipIf(not core.supports_bfloat16(),
                 "place does not support BF16 evaluation")
class TestFusionLSTMBF16ONEDNNOp(OpTest):
    """Checks the oneDNN ``fusion_lstm`` kernel fed with bf16 input data.

    ``setUp`` draws random fp32 tensors, computes the expected ``Hidden`` and
    ``Cell`` outputs with the fp32 reference ``fusion_lstm`` helper, and feeds
    the operator with the bf16 (uint16-encoded) version of ``X``.  Weights and
    ``H0`` are passed either as fp32 or as bf16, depending on
    ``self.weights_dtype``.  Subclasses customise a scenario by overriding
    ``set_confs``.
    """

    def set_confs(self):
        # Configuration hook: called from setUp() after the defaults are set,
        # so anything assigned here overrides them.
        self.mkldnn_data_type = False

    def test_check_output(self):
        # Exercise both settings of the `use_seq` attribute.  A tuple keeps the
        # order explicit; the "Cell" output is excluded from the comparison.
        for use_seq in (False, True):
            self.attrs['use_seq'] = use_seq
            self.check_output(check_dygraph=False, no_check_set=["Cell"])

    def setUp(self):
        self.op_type = 'fusion_lstm'
        self.lod = [[2, 3, 5, 4]]
        self.M = 8
        self.D = 16
        self.has_initial_state = False
        self.use_peepholes = False
        self.is_reverse = False
        self._cpu_only = True
        self.act_gate = 'sigmoid'
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'
        self.use_mkldnn = True
        self.force_fp32_output = False
        self.weights_dtype = 'fp32'
        self.set_confs()

        T = sum(self.lod[0])
        bs = len(self.lod[0])

        # fp32 X input for the reference implementation and the
        # corresponding bf16 data as input to the LSTM oneDNN bf16 kernel
        x = np.random.normal(size=(T, self.M)).astype('float32')
        x_bf16 = convert_float_to_uint16(x)

        if self.has_initial_state:
            h0 = np.random.normal(size=(bs, self.D)).astype('float32')
            c0 = np.random.normal(size=(bs, self.D)).astype('float32')
        else:
            h0 = np.zeros((bs, self.D)).astype('float32')
            c0 = np.zeros((bs, self.D)).astype('float32')

        wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32')

        h0_bf16 = convert_float_to_uint16(h0)

        # With peepholes the bias carries 3 * D extra entries after the
        # 4 * D gate biases.
        if self.use_peepholes:
            b = np.random.normal(size=(1, 7 * self.D)).astype('float32')
        else:
            b = np.random.normal(size=(1, 4 * self.D)).astype('float32')

        # w_b is copied *before* bx is folded into b below, so the reference
        # receives the two bias terms separately.
        w_b = np.copy(b[:, 0:4 * self.D])
        w_c = b[:, 4 * self.D:] if self.use_peepholes else None

        wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32')

        wx_bf16 = convert_float_to_uint16(wx)
        wh_bf16 = convert_float_to_uint16(wh)

        bx = np.random.normal(size=(1, 4 * self.D)).astype('float32')
        # The operator gets a single 'Bias' input: add bx to its gate part.
        b[0, 0:4 * self.D] += bx[0, :]

        hidden, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c,
                                self.is_reverse, ACTIVATION[self.act_gate],
                                ACTIVATION[self.act_cell],
                                ACTIVATION[self.act_cand])

        # The expected output is kept in fp32.
        hidden = hidden.astype('float32')

        if self.weights_dtype == 'bf16':
            self.inputs = {
                'X': (x_bf16, self.lod),
                'WeightX': wx_bf16,
                'WeightH': wh_bf16,
                'Bias': b
            }
        elif self.weights_dtype == 'fp32':
            self.inputs = {
                'X': (x_bf16, self.lod),
                'WeightX': wx,
                'WeightH': wh,
                'Bias': b
            }

        if self.has_initial_state:
            # H0 follows the weights' data type; C0 is always passed as fp32.
            if self.weights_dtype == 'bf16':
                self.inputs['H0'] = h0_bf16
            elif self.weights_dtype == 'fp32':
                self.inputs['H0'] = h0

            self.inputs['C0'] = c0

        self.outputs = {
            'Hidden': (hidden, self.lod),
            'Cell': (c, self.lod),
        }

        self.attrs = {
            'use_peepholes': self.use_peepholes,
            'is_reverse': self.is_reverse,
            'gate_activation': self.act_gate,
            'cell_activation': self.act_cell,
            'candidate_activation': self.act_cand,
            'force_fp32_output': self.force_fp32_output,
            'use_mkldnn': self.use_mkldnn
        }
class TestFusionLSTMBF16ONEDNNPeepholesOp(TestFusionLSTMBF16ONEDNNOp):
    """Variant of the bf16 fusion_lstm test with ``use_peepholes`` enabled."""

    def set_confs(self):
        # Invoked by the base class setUp() after its defaults are assigned.
        self.use_peepholes = True
class TestFusionLSTMBF16ONEDNNInitializedStateOp(TestFusionLSTMBF16ONEDNNOp):
    """Variant of the bf16 fusion_lstm test with ``has_initial_state`` set."""

    def set_confs(self):
        # Invoked by the base class setUp() after its defaults are assigned.
        self.has_initial_state = True
class TestFusionLSTMBF16ONEDNNReverseOp(TestFusionLSTMBF16ONEDNNOp):
    """Variant of the bf16 fusion_lstm test with ``is_reverse`` enabled."""

    def set_confs(self):
        # Invoked by the base class setUp() after its defaults are assigned.
        self.is_reverse = True
class TestFusionLSTMBF16ONEDNNBF16WeightsOp(TestFusionLSTMBF16ONEDNNOp):
    """Variant of the bf16 fusion_lstm test that selects bf16 weights."""

    def set_confs(self):
        # Invoked by the base class setUp() after its defaults are assigned;
        # setUp() then picks the bf16 WeightX / WeightH inputs.
        self.weights_dtype = 'bf16'
if __name__ == "__main__":
    # Script entry point: switch Paddle to static mode, then hand control
    # to the unittest runner.
    from paddle import enable_static

    enable_static()
    unittest.main()
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
5b4f8aac
...
@@ -235,6 +235,19 @@ def convert_float_to_uint16(float_list, data_format="NCHW"):
...
@@ -235,6 +235,19 @@ def convert_float_to_uint16(float_list, data_format="NCHW"):
return
new_output
return
new_output
def copy_bits_from_uint16_to_float(i):
    """Reinterpret a 16-bit pattern as the upper half of a float32.

    The value ``i`` is shifted into bits 16..31 of a 32-bit word (the lower
    16 bits are zero) and that word is decoded as a little-endian IEEE-754
    single-precision number.

    Args:
        i: integer-like value holding the 16-bit pattern (a Python int, a
            NumPy integer scalar or a 0-d integer array).

    Returns:
        The decoded value as a Python ``float``.
    """
    # Use plain Python int arithmetic so that the widened value does not
    # depend on NumPy's scalar type-promotion rules.
    bits = int(i) << 16
    return struct.unpack('<f', struct.pack('<I', bits))[0]
def convert_uint16_to_float(uint16_list):
    """Decode an array of 16-bit patterns into float32 values.

    Every element is treated as the upper 16 bits of a float32 word (the
    lower 16 bits are zero).  The result has the same shape as the input.

    The conversion is element-wise and vectorised, so it keeps the logical
    element order for any memory layout (C-ordered, Fortran-ordered, sliced)
    and also accepts zero-size arrays.

    Args:
        uint16_list: array-like of integer values holding the 16-bit
            patterns, typically an ``np.uint16`` ndarray.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with the shape of the input.
    """
    bits = np.asarray(uint16_list).astype(np.uint32)
    # Move the pattern into the high half-word, then reinterpret the bytes.
    return (bits << np.uint32(16)).view(np.float32)
class
OpTest
(
unittest
.
TestCase
):
class
OpTest
(
unittest
.
TestCase
):
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
...
@@ -1143,8 +1156,14 @@ class OpTest(unittest.TestCase):
...
@@ -1143,8 +1156,14 @@ class OpTest(unittest.TestCase):
idx
=
find_actual
(
out_name
,
fetch_list
)
idx
=
find_actual
(
out_name
,
fetch_list
)
actual
=
outs
[
idx
]
actual
=
outs
[
idx
]
actual_t
=
np
.
array
(
actual
)
actual_t
=
np
.
array
(
actual
)
expect
=
self
.
outputs
[
out_name
]
expect
=
self
.
outputs
[
out_name
]
expect_t
=
expect
[
0
]
if
isinstance
(
expect
,
tuple
)
else
expect
expect_t
=
expect
[
0
]
if
isinstance
(
expect
,
tuple
)
else
expect
if
actual_t
.
dtype
==
np
.
uint16
and
expect_t
.
dtype
==
np
.
float32
:
actual_t
=
convert_uint16_to_float
(
actual_t
)
atol
=
0.03
self
.
assertTrue
(
self
.
assertTrue
(
np
.
allclose
(
np
.
allclose
(
actual_t
,
expect_t
,
atol
=
atol
,
equal_nan
=
equal_nan
),
actual_t
,
expect_t
,
atol
=
atol
,
equal_nan
=
equal_nan
),
...
...
tools/static_mode_white_list.py
浏览文件 @
5b4f8aac
...
@@ -602,8 +602,10 @@ STATIC_MODE_TESTING_LIST = [
...
@@ -602,8 +602,10 @@ STATIC_MODE_TESTING_LIST = [
'test_nearest_interp_mkldnn_op'
,
'test_nearest_interp_mkldnn_op'
,
'test_bilinear_interp_mkldnn_op'
,
'test_bilinear_interp_mkldnn_op'
,
'test_fusion_gru_int8_mkldnn_op'
,
'test_fusion_gru_int8_mkldnn_op'
,
'test_fusion_gru_bf16_mkldnn_op'
,
'test_fusion_gru_mkldnn_op'
,
'test_fusion_gru_mkldnn_op'
,
'test_fusion_lstm_mkldnn_op'
,
'test_fusion_lstm_mkldnn_op'
,
'test_fusion_lstm_bf16_mkldnn_op'
,
'test_gaussian_random_mkldnn_op'
,
'test_gaussian_random_mkldnn_op'
,
'test_lrn_mkldnn_op'
,
'test_lrn_mkldnn_op'
,
'test_matmul_mkldnn_op'
,
'test_matmul_mkldnn_op'
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录