Commit 824a79d3 (unverified)
Authored Jan 26, 2021 by Tao Luo; committed via GitHub on Jan 26, 2021
Revert "Added vanilla LSTM and LSTM with peepholes oneDNN fp32 kernel (#30661)" (#30708)
This reverts commit d834f4e6.
Parent commit: 7fbc68a2
Showing 12 changed files with 218 additions and 756 deletions (+218, -756)
cmake/operators.cmake  +1 -1
paddle/fluid/operators/fused/CMakeLists.txt  +1 -5
paddle/fluid/operators/fused/fusion_lstm_op.cc  +1 -16
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc  +214 -36
paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc  +0 -377
paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h  +0 -229
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py  +0 -2
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_mkldnn_op.py  +0 -81
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py  +0 -2
python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py  +1 -5
python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py  +0 -1
tools/static_mode_white_list.py  +0 -1
cmake/operators.cmake (view file @ 824a79d3)

@@ -197,7 +197,7 @@ function(op_library TARGET)
       "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
       "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
       "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-      "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
+      "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
       "fused_bn_add_activation_op")
     if ("${TARGET}" STREQUAL "${manual_pybind_op}")
       set(pybind_flag 1)
paddle/fluid/operators/fused/CMakeLists.txt (view file @ 824a79d3)

@@ -14,15 +14,11 @@ register_operators(EXCLUDES
         fused_embedding_eltwise_layernorm_op
         fusion_group_op
         fusion_gru_op
-        fusion_lstm_op
         fused_bn_add_activation_op)

 # fusion_gru_op does not have CUDA kernel
 op_library(fusion_gru_op)
-op_library(fusion_lstm_op)
-file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\nUSE_CPU_ONLY_OP(fusion_lstm);\n")
+file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n")

 if (WITH_GPU)
   # fused_bn_activation_op needs cudnn 7.4.1 above
paddle/fluid/operators/fused/fusion_lstm_op.cc (view file @ 824a79d3)

@@ -18,9 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif

 namespace paddle {
 namespace operators {

@@ -148,17 +145,8 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library = framework::LibraryType::kPlain;
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-#ifdef PADDLE_WITH_MKLDNN
-  if (this->CanMKLDNNBeUsed(ctx)) {
-    library = framework::LibraryType::kMKLDNN;
-    layout = framework::DataLayout::kMKLDNN;
-  }
-#endif
   return framework::OpKernelType(
-      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
-      layout, library);
+      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context());
 }

 void FusionLSTMOpMaker::Make() {

@@ -247,9 +235,6 @@ void FusionLSTMOpMaker::Make() {
                "`tanh` by default.")
       .SetDefault("tanh")
       .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<bool>("use_mkldnn",
-                "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
   AddComment(R"DOC(
 Fusion Long-Short Term Memory (LSTM) Operator.
 This operator fuse the X into LSTM, more details can refer to LSTM op.
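The lines removed above are the usual pattern for routing an operator to its oneDNN (MKL-DNN) kernel when one is registered: start from the plain CPU kernel type and switch library and layout only when the build has oneDNN and the runtime check passes. A minimal standalone sketch of that dispatch idea follows; the Library/Layout enums and PickLibrary function are hypothetical stand-ins for the Paddle framework types, not Paddle code.

#include <iostream>

// Hypothetical stand-ins for framework::LibraryType / framework::DataLayout.
enum class Library { kPlain, kMKLDNN };
enum class Layout { kAnyLayout, kMKLDNN };

// Mirror of the removed GetExpectedKernelType logic: default to the plain
// kernel, upgrade to oneDNN only if it is compiled in and usable here.
Library PickLibrary(bool built_with_mkldnn, bool can_use_mkldnn, Layout* layout) {
  Library library = Library::kPlain;
  *layout = Layout::kAnyLayout;
  if (built_with_mkldnn && can_use_mkldnn) {
    library = Library::kMKLDNN;
    *layout = Layout::kMKLDNN;
  }
  return library;
}

int main() {
  Layout layout;
  Library lib = PickLibrary(true, true, &layout);
  std::cout << (lib == Library::kMKLDNN ? "oneDNN kernel" : "plain kernel") << "\n";
}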
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc (view file @ 824a79d3)

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/fused/fusion_gru_op.h"
-#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"

 namespace paddle {
 namespace operators {

@@ -27,7 +27,7 @@ using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;

 template <typename T, typename T_out = T>
-class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
+class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
  public:
  GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
                   const platform::MKLDNNDeviceContext& dev_ctx,

@@ -37,12 +37,37 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
                   const bool is_reverse, const int64_t N, const int64_t Ti,
                   const int64_t IC, const int64_t OC,
                   const std::string& unique_name)
-      : RNNMKLDNNHandler<T, dnnl::gru_forward, T_out>(
-            ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
-            is_reverse, N, Ti, IC, OC, 3,
-            ctx.InputName("X") + ctx.InputName("WeightH")) {
+      : platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
+            dev_ctx, dev_ctx.GetEngine(), cpu_place,
+            CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>(), Ti)),
+        N(N),
+        Ti(Ti),
+        IC(IC),
+        OC(OC) {
+    // Create memory key without Ti because weights, bias and h0 memories
+    // do not depend on Ti size but primitive and input/output memory do
+    memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded(
+        dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>()));
+
+    // Is it int8 kernel
+    const bool is_INT8 = std::is_same<T, uint8_t>::value;
+
+    if (is_INT8) {
+      // Int8 attributes
+      const float scale_data = ctx.Attr<float>("Scale_data");
+      const float shift_data = ctx.Attr<float>("Shift_data");
+      const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
+
+      const int weights_scale_mask =
+          0 + (1 << 3)  // bit, indicating the unique scales for `g` dim in `ldigo`
+          + (1 << 4);   // bit, indicating the unique scales for `o` dim in `ldigo`
+
+      attr_.set_rnn_data_qparams(scale_data, shift_data);
+      attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
+    }
+
     if (!this->isCached()) {
       // oneDNN kernel has hardcoded activation functions
       PADDLE_ENFORCE_EQ(

@@ -83,35 +108,176 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
                 : dnnl::rnn_direction::unidirectional_left2right;

       this->AcquireForwardPrimitiveDescriptor(
-          this->attr_, dnnl::prop_kind::forward_inference, direction, input_md,
+          attr_, dnnl::prop_kind::forward_inference, direction, input_md,
           h0_md, weight_x_md, weight_h_md, bias_md, hidden_md,
           dnnl::memory::desc());
     }
   }

+  bool is_NTC() {
+    return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
+            dnnl::memory::format_tag::ntc);
+  }
+
+  void reorderRNNdata(void* input_data, void* output_data,
+                      std::vector<size_t> lod, const bool is_reverse,
+                      platform::RNNReorderType reorder_type) {
+    switch (reorder_type) {
+      // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
+      case platform::RNNReorderType::PP_NTC: {
+        auto* input_data_iter = reinterpret_cast<T*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T*>(output_data);
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = (lod[n + 1] - lod[n]) * IC;
+          const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
+          memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
+                 sizeof(T) * num_elements);
+          input_data_iter += num_elements;
+        }
+      } break;
+      // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
+      case platform::RNNReorderType::PP_TNC: {
+        auto* input_data_iter = reinterpret_cast<T*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T*>(output_data);
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = (lod[n + 1] - lod[n]);
+          const auto offset = is_reverse ? (Ti - num_elements) : 0;
+          for (size_t t = 0; t < num_elements; ++t) {
+            memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
+                   input_data_iter, sizeof(T) * IC);
+            input_data_iter += IC;
+          }
+        }
+      } break;
+      // Reorder output values to PP format [N, T, C] -> [WORDS, C]
+      case platform::RNNReorderType::NTC_PP: {
+        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = (lod[n + 1] - lod[n]) * OC;
+          const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
+          memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
+                 sizeof(T_out) * num_elements);
+          output_data_iter += num_elements;
+        }
+      } break;
+      // Reorder output values to PP format [T, N, C] -> [WORDS, C]
+      case platform::RNNReorderType::TNC_PP: {
+        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = lod[n + 1] - lod[n];
+          const auto offset = is_reverse ? (Ti - num_elements) : 0;
+          for (size_t t = 0; t < num_elements; ++t) {
+            memcpy(output_data_iter,
+                   input_data_iter + (t + offset) * N * OC + n * OC,
+                   sizeof(T_out) * OC);
+            output_data_iter += OC;
+          }
+        }
+      } break;
+    }
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
+      const LoDTensor* input, const bool is_reverse) {
+    const auto name = this->key_ + "@input_mem";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
+
+    if (!memory_p) {
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
+                                                this->engine_);
+      this->dev_ctx_.SetBlob(name, memory_p);
+    }
+
+    const auto& input_lod = input->lod()[0];
+    auto* x_data = to_void_cast(input->data<T>());
+
+    auto* x_onednn_data = memory_p->get_data_handle();
+    memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
+
+    if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
+        dnnl::memory::format_tag::ntc) {
+      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
+                     platform::RNNReorderType::PP_NTC);
+    } else {
+      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
+                     platform::RNNReorderType::PP_TNC);
+    }
+    return memory_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
+    const auto name = this->key_ + "@output_mem";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
+
+    if (!memory_p) {
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
+                                                this->engine_);
+      this->dev_ctx_.SetBlob(name, memory_p);
+    }
+    return memory_p;
+  }
+
+  // TODO(grygielski) H0 is for now persistable
+  // TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
+  // not support in yet)
+  std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
+    const std::string h0_key = memory_key_ + "@h0";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
+
+    if (!memory_p) {
+      auto user_h0_memory = dnnl::memory();
+      if (h0) {
+        user_h0_memory =
+            dnnl::memory({{1, 1, N, OC}, MKLDNNGetDataType<float>(),
+                          MKLDNNMemoryFormat::ldnc},
+                         this->engine_, to_void_cast(h0->data<float>()));
+      } else {
+        user_h0_memory = dnnl::memory({{1, 1, N, OC},
+                                       MKLDNNGetDataType<float>(),
+                                       MKLDNNMemoryFormat::ldnc},
+                                      this->engine_);
+        memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
+      }
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
+                                                this->engine_);
+
+      auto& astream =
+          paddle::platform::MKLDNNDeviceContext::tls().get_stream();
+      dnnl::reorder(user_h0_memory, *memory_p, attr_)
+          .execute(astream, user_h0_memory, *memory_p);
+
+      this->dev_ctx_.SetBlob(h0_key, memory_p);
+    }
+    return memory_p;
+  }
+
   std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x,
                                                      const bool origin_mode) {
-    const std::string wx_key = this->memory_key_ + "@weight_x";
+    const std::string wx_key = memory_key_ + "@weight_x";
     auto memory_p =
         std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));

     if (!memory_p) {
       auto user_md =
-          MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC},
-                        MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
+          MKLDNNMemDesc({1, 1, IC, 3, OC}, MKLDNNGetDataType<float>(),
+                        MKLDNNMemoryFormat::ldigo);
       auto user_memory = dnnl::memory(user_md, this->engine_);

       auto* weight_x_data =
           reinterpret_cast<float*>(user_memory.get_data_handle());
       memcpy(weight_x_data, weight_x->data<float>(),
-             sizeof(float) * this->IC * this->G * this->OC);
+             sizeof(float) * IC * 3 * OC);

       if (origin_mode == false) {
-        for (int64_t i = 0; i < this->IC; ++i) {
-          for (int64_t j = 0; j < this->OC; ++j) {
+        for (int64_t i = 0; i < IC; ++i) {
+          for (int64_t j = 0; j < OC; ++j) {
             weight_x_data[j] *= -1;
           }
-          weight_x_data += 3 * this->OC;
+          weight_x_data += 3 * OC;
         }
       }

@@ -119,7 +285,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
           this->fwd_pd_->weights_layer_desc(), this->engine_);

       auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
-      dnnl::reorder(user_memory, *memory_p, this->attr_)
+      dnnl::reorder(user_memory, *memory_p, attr_)
           .execute(astream, user_memory, *memory_p);

       this->dev_ctx_.SetBlob(wx_key, memory_p);

@@ -129,14 +295,14 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
   std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h,
                                                      const bool origin_mode) {
-    const std::string wh_key = this->memory_key_ + "@weight_h";
+    const std::string wh_key = memory_key_ + "@weight_h";
     auto memory_p =
         std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));

     if (!memory_p) {
       auto user_md =
-          MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC},
-                        MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
+          MKLDNNMemDesc({1, 1, OC, 3, OC}, MKLDNNGetDataType<float>(),
+                        MKLDNNMemoryFormat::ldigo);
       auto user_memory = dnnl::memory(user_md, this->engine_);

       // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to

@@ -146,26 +312,25 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
       auto* user_weight_h_data = weight_h->data<float>();

       auto src1_iter = user_weight_h_data;
-      auto src2_iter = user_weight_h_data + 2 * this->OC * this->OC;
+      auto src2_iter = user_weight_h_data + 2 * OC * OC;

-      for (int64_t c = 0; c < this->OC; ++c) {
-        memcpy(weight_h_data, src1_iter, 2 * this->OC * sizeof(float));
-        memcpy(weight_h_data + 2 * this->OC, src2_iter,
-               this->OC * sizeof(float));
+      for (int64_t c = 0; c < OC; ++c) {
+        memcpy(weight_h_data, src1_iter, 2 * OC * sizeof(float));
+        memcpy(weight_h_data + 2 * OC, src2_iter, OC * sizeof(float));

-        src1_iter += 2 * this->OC;
-        src2_iter += this->OC;
-        weight_h_data += 3 * this->OC;
+        src1_iter += 2 * OC;
+        src2_iter += OC;
+        weight_h_data += 3 * OC;
       }

       weight_h_data = reinterpret_cast<float*>(user_memory.get_data_handle());

       if (origin_mode == false) {
-        for (int64_t i = 0; i < this->OC; ++i) {
-          for (int64_t j = 0; j < this->OC; ++j) {
+        for (int64_t i = 0; i < OC; ++i) {
+          for (int64_t j = 0; j < OC; ++j) {
             weight_h_data[j] *= -1;
           }
-          weight_h_data += 3 * this->OC;
+          weight_h_data += 3 * OC;
         }
       }

@@ -173,7 +338,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
           this->fwd_pd_->weights_iter_desc(), this->engine_);

       auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
-      dnnl::reorder(user_memory, *memory_p, this->attr_)
+      dnnl::reorder(user_memory, *memory_p, attr_)
           .execute(astream, user_memory, *memory_p);

       this->dev_ctx_.SetBlob(wh_key, memory_p);

@@ -183,7 +348,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
   std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias,
                                                   const bool origin_mode) {
-    const std::string bias_key = this->memory_key_ + "@bias";
+    const std::string bias_key = memory_key_ + "@bias";
     auto memory_p = std::static_pointer_cast<dnnl::memory>(
         this->dev_ctx_.GetBlob(bias_key));

@@ -194,15 +359,15 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
       if (bias) {
         const float* user_bias_data =
             bias->data<float>();  // Bias in oneDNN is always float
-        memcpy(bias_data, user_bias_data, sizeof(float) * this->G * this->OC);
+        memcpy(bias_data, user_bias_data, sizeof(float) * 3 * OC);
       } else {
         // oneDNN always need bias memory, if it's not provided in PP, let
         // oneDNN allocate memory and set it to 0
-        memset(bias_data, 0, sizeof(float) * this->G * this->OC);
+        memset(bias_data, 0, sizeof(float) * 3 * OC);
       }

       if (origin_mode == false && bias) {
-        for (int64_t i = 0; i < this->OC; ++i) {
+        for (int64_t i = 0; i < OC; ++i) {
           bias_data[i] *= -1;
         }
       }

@@ -210,6 +375,19 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
     }
     return memory_p;
   }

+ private:
+  // RNN dimensions
+  // N - Batch Size
+  // Ti - Max sentence length
+  // IC - Input Channels
+  // OC - Output Channels
+  const int64_t N, Ti, IC, OC;
+
+  // Memory size of weights, bias and h0 does not depend
+  // on Ti size, thus we need another key to cache them
+  std::string memory_key_;
+  dnnl::primitive_attr attr_;
 };

 template <typename T>
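The reorderRNNdata routine restored into the GRU handler above packs Paddle's LoD-encoded batch ([WORDS, C] plus per-sequence offsets) into the dense, zero-padded [N, T, C] buffer that oneDNN expects. The following is a self-contained sketch of the PP_NTC case only; it is not Paddle code, and the lod, N, Ti, IC values are chosen purely for illustration.

#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // Toy batch: two sequences of lengths 2 and 3, feature size IC = 2.
  // LoD offsets mark where each sequence starts in the packed input.
  const std::vector<size_t> lod = {0, 2, 5};
  const int N = 2, Ti = 3, IC = 2;

  // Packed PaddlePaddle layout [WORDS, C]: 5 time steps, 2 channels each.
  std::vector<float> packed = {1, 1, 2, 2, 3, 3, 4, 4, 5, 5};

  // Dense oneDNN layout [N, T, C], zero-padded up to the longest sequence.
  std::vector<float> dense(N * Ti * IC, 0.f);

  const bool is_reverse = false;
  const float* src = packed.data();
  for (int n = 0; n < N; ++n) {
    const size_t num_elements = (lod[n + 1] - lod[n]) * IC;
    // Reversed sequences are right-aligned so their last step lands at t = Ti-1.
    const size_t offset = is_reverse ? (Ti * IC - num_elements) : 0;
    std::memcpy(dense.data() + n * Ti * IC + offset, src,
                sizeof(float) * num_elements);
    src += num_elements;
  }

  for (float v : dense) std::cout << v << ' ';  // 1 1 2 2 0 0 3 3 4 4 5 5
  std::cout << '\n';
}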
paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc — deleted (100644 → 0, view file @ 7fbc68a2)

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"

namespace paddle {
namespace operators {

using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;

template <typename T, typename T_out = T>
class LSTMMKLDNNHandler
    : public RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out> {
 public:
  LSTMMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
                    const platform::MKLDNNDeviceContext& dev_ctx,
                    const mkldnn::engine mkldnn_engine,
                    platform::Place cpu_place, const LoDTensor* input,
                    const Tensor* weight_h, const Tensor* h0, const Tensor* c0,
                    const bool is_reverse, const int64_t N, const int64_t Ti,
                    const int64_t IC, const int64_t OC,
                    const std::string& unique_name)
      : RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out>(
            ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
            is_reverse, N, Ti, IC, OC, 4,
            ctx.InputName("X") + ctx.InputName("WeightH")) {
    if (!this->isCached()) {
      const bool is_INT8 = std::is_same<T, uint8_t>::value;
      const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
      // oneDNN kernel has hardcoded activation functions
      PADDLE_ENFORCE_EQ(
          ctx.Attr<std::string>("gate_activation"), "sigmoid",
          platform::errors::Unimplemented("oneDNN fusion_lstm supports only "
                                          "sigmoid as a gate activation."));
      PADDLE_ENFORCE_EQ(
          ctx.Attr<std::string>("cell_activation"), "tanh",
          platform::errors::Unimplemented(
              "oneDNN fusion_lstm supports only tanh as a cell activation."));
      PADDLE_ENFORCE_EQ(
          ctx.Attr<std::string>("candidate_activation"), "tanh",
          platform::errors::Unimplemented(
              "oneDNN fusion_lstm supports only tanh a candidate activation."));

      // Weights for int8 kernel are of a type s8
      const auto weights_dt =
          is_INT8 ? dnnl::memory::data_type::s8 : MKLDNNGetDataType<T>();

      // oneDNN RNN dimensions
      const int64_t D = 1;  // Directions
      const int64_t L = 1;  // Layers (PP supports only 1 stacked layer)
      const int64_t G = 4;  // Number of Gates, 4 for LSTM

      // Create memory descriptors
      auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
                                    MKLDNNMemoryFormat::tnc);
      auto weight_x_md =
          MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
      auto weight_h_md =
          MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
      auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
                                   MKLDNNMemoryFormat::ldgo);
      auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
                                     MKLDNNMemoryFormat::tnc);
      auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
                                 MKLDNNMemoryFormat::ldnc);
      auto c0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
                                 MKLDNNMemoryFormat::ldnc);

      // Create LSTM oneDNN primitive
      const auto direction =
          is_reverse ? dnnl::rnn_direction::unidirectional_right2left
                     : dnnl::rnn_direction::unidirectional_left2right;
      if (!use_peepholes) {
        this->AcquireForwardPrimitiveDescriptor(
            this->attr_, dnnl::prop_kind::forward_inference, direction,
            input_md, h0_md, c0_md, weight_x_md, weight_h_md, bias_md,
            hidden_md, dnnl::memory::desc(), dnnl::memory::desc());
      } else {
        auto weight_peephole_md =
            MKLDNNMemDesc({L, D, 3, OC}, MKLDNNGetDataType<float>(),
                          MKLDNNMemoryFormat::ldgo);
        this->AcquireForwardPrimitiveDescriptor(
            this->attr_, dnnl::prop_kind::forward_inference, direction,
            input_md, h0_md, c0_md, weight_x_md, weight_h_md,
            weight_peephole_md, bias_md, hidden_md, dnnl::memory::desc(),
            dnnl::memory::desc());
      }
    }
  }

  // PaddlePaddle has different order of weights than oneDNN, so a reorder is
  // needed
  // PaddlePaddle: {c, i, f, o}
  // oneDNN:       {i, f, c, o}
  void ReorderGates(float* weights, int64_t I) {
    size_t inner_block_size = this->OC;
    size_t block_size = inner_block_size * this->G;
    for (size_t i = 0; i < (size_t)I; ++i) {
      size_t offset = i * block_size;

      float* base_pos = weights + offset;
      std::swap_ranges(base_pos, base_pos + inner_block_size,
                       base_pos + inner_block_size);  // c <-> i
      std::swap_ranges(base_pos + inner_block_size,
                       base_pos + 2 * inner_block_size,
                       base_pos + 2 * inner_block_size);  // c <-> f
    }
  }

  std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x) {
    const std::string wx_key = this->memory_key_ + "@weight_x";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));

    if (!memory_p) {
      auto user_md =
          MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC},
                        MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
      auto user_memory = dnnl::memory(user_md, this->engine_);

      auto* weight_x_data =
          reinterpret_cast<float*>(user_memory.get_data_handle());
      memcpy(weight_x_data, weight_x->data<float>(),
             sizeof(float) * this->IC * this->G * this->OC);

      ReorderGates(weight_x_data, this->IC);

      memory_p = std::make_shared<dnnl::memory>(
          this->fwd_pd_->weights_layer_desc(), this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_memory, *memory_p, this->attr_)
          .execute(astream, user_memory, *memory_p);

      this->dev_ctx_.SetBlob(wx_key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h) {
    const std::string wh_key = this->memory_key_ + "@weight_h";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));

    if (!memory_p) {
      auto user_md =
          MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC},
                        MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
      auto user_memory = dnnl::memory(user_md, this->engine_);

      auto* weight_h_data =
          reinterpret_cast<float*>(user_memory.get_data_handle());
      memcpy(weight_h_data, weight_h->data<float>(),
             sizeof(float) * this->OC * this->G * this->OC);

      ReorderGates(weight_h_data, this->OC);

      memory_p = std::make_shared<dnnl::memory>(
          this->fwd_pd_->weights_iter_desc(), this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_memory, *memory_p, this->attr_)
          .execute(astream, user_memory, *memory_p);

      this->dev_ctx_.SetBlob(wh_key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias) {
    const std::string bias_key = this->memory_key_ + "@bias";
    auto memory_p = std::static_pointer_cast<dnnl::memory>(
        this->dev_ctx_.GetBlob(bias_key));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->bias_desc(),
                                                this->engine_);
      auto* bias_data = reinterpret_cast<float*>(memory_p->get_data_handle());
      if (bias) {
        const float* user_bias_data =
            bias->data<float>();  // Bias in oneDNN is always float
        memcpy(bias_data, user_bias_data, sizeof(float) * this->G * this->OC);

        ReorderGates(bias_data, 1);
      } else {
        // oneDNN always need bias memory, if it's not provided in PP, let
        // oneDNN allocate memory and set it to 0
        memset(bias_data, 0, sizeof(float) * this->G * this->OC);
      }

      this->dev_ctx_.SetBlob(bias_key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquirePeepholeWeights(const Tensor* bias) {
    const std::string peepholes_key = this->memory_key_ + "@peepholes_weights";
    auto memory_p = std::static_pointer_cast<dnnl::memory>(
        this->dev_ctx_.GetBlob(peepholes_key));

    if (!memory_p) {
      auto user_md =
          MKLDNNMemDesc({1, 1, 3, this->OC}, MKLDNNGetDataType<float>(),
                        MKLDNNMemoryFormat::ldgo);
      auto user_memory = dnnl::memory(user_md, this->engine_);
      memory_p = std::make_shared<dnnl::memory>(
          this->fwd_pd_->weights_peephole_desc(), this->engine_);
      auto* peephole_weights_data =
          reinterpret_cast<float*>(memory_p->get_data_handle());

      const float* user_bias_data =
          bias->data<float>();  // Bias in oneDNN is always float
      memcpy(peephole_weights_data, user_bias_data + 4 * this->OC,
             sizeof(float) * 3 * this->OC);

      this->dev_ctx_.SetBlob(peepholes_key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireC0Memory(const Tensor* c0) {
    const std::string c0_key = this->memory_key_ + "@c0";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(c0_key));

    if (!memory_p) {
      auto user_c0_memory = dnnl::memory();
      if (c0) {
        user_c0_memory =
            dnnl::memory({{1, 1, this->N, this->OC},
                          MKLDNNGetDataType<float>(),
                          MKLDNNMemoryFormat::ldnc},
                         this->engine_, to_void_cast(c0->data<float>()));
      } else {
        user_c0_memory = dnnl::memory({{1, 1, this->N, this->OC},
                                       MKLDNNGetDataType<float>(),
                                       MKLDNNMemoryFormat::ldnc},
                                      this->engine_);
        memset(user_c0_memory.get_data_handle(), 0,
               sizeof(float) * this->N * this->OC);
      }
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
                                                this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_c0_memory, *memory_p, this->attr_)
          .execute(astream, user_c0_memory, *memory_p);

      this->dev_ctx_.SetBlob(c0_key, memory_p);
    }
    return memory_p;
  }
};

template <typename T>
class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    RunKernel<float>(ctx);
  }

  template <typename Tout = T>
  void RunKernel(const framework::ExecutionContext& ctx) const {
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();

    // Get Tensors
    const auto* input = ctx.Input<LoDTensor>("X");
    const auto* h0 = ctx.Input<Tensor>("H0");
    const auto* c0 = ctx.Input<Tensor>("C0");
    const auto* weight_x = ctx.Input<Tensor>("WeightX");
    const auto* weight_h = ctx.Input<Tensor>("WeightH");
    const auto* bias = ctx.Input<Tensor>("Bias");
    auto* hidden = ctx.Output<LoDTensor>("Hidden");
    auto* cell = ctx.Output<LoDTensor>("Cell");
    cell = cell;
    auto x_dims = input->dims();
    auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
                          ? framework::flatten_to_2d(x_dims, 1)
                          : x_dims;
    // Get attributes
    const bool is_reverse = ctx.Attr<bool>("is_reverse");
    const bool use_peepholes = ctx.Attr<bool>("use_peepholes");

    // Get tensor dimensions
    const auto x_mat_dims_vec = framework::vectorize(x_mat_dims);
    const auto weight_h_dims = framework::vectorize(weight_h->dims());
    const auto& input_lod = input->lod()[0];

    // Calculate RNN dimensions
    const int64_t N = input_lod.size() - 1;  // Number of sentences (batches)
    const int64_t Ti =  // Max length of the sentence in a batch
        [&input_lod]() {
          size_t res = 0;
          for (size_t i = 0; i < (input_lod.size() - 1); ++i) {
            res = std::max(res, input_lod[i + 1] - input_lod[i]);
          }
          return res;
        }();
    const int64_t IC = x_mat_dims_vec[1];  // Input channels
    const int64_t OC = weight_h_dims[0];   // Output channels

    LSTMMKLDNNHandler<T, Tout> handler(
        ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0, c0,
        is_reverse, N, Ti, IC, OC,
        ctx.InputName("X") + ctx.InputName("WeightH"));

    auto input_memory_p =
        handler.AcquireInputMemoryWithReorder(input, is_reverse);
    auto h0_memory_p = handler.AcquireH0Memory(h0);
    auto c0_memory_p = handler.AcquireC0Memory(c0);
    auto weight_x_memory_p = handler.AcquireWeightXMemory(weight_x);
    auto weight_h_memory_p = handler.AcquireWeightHMemory(weight_h);
    auto bias_memory_p = handler.AcquireBiasMemory(bias);
    auto hidden_onednn_memory_p = handler.AcquireOutputMemory();

    std::unordered_map<int, dnnl::memory> lstm_args = {
        {DNNL_ARG_SRC_LAYER, *input_memory_p},
        {DNNL_ARG_SRC_ITER, *h0_memory_p},
        {DNNL_ARG_SRC_ITER_C, *c0_memory_p},
        {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
        {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
        {DNNL_ARG_BIAS, *bias_memory_p},
        {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};

    if (use_peepholes) {
      auto peephole_weight_p = handler.AcquirePeepholeWeights(bias);
      std::pair<int, dnnl::memory> peepholes_weights(DNNL_ARG_WEIGHTS_PEEPHOLE,
                                                     *peephole_weight_p);
      lstm_args.insert(peepholes_weights);
    }

    auto lstm_forward_p = handler.AcquireForwardPrimitive();

    dnnl::stream astream(mkldnn_engine);
    lstm_forward_p->execute(astream, lstm_args);
    astream.wait();

    auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
    auto* hidden_data =
        to_void_cast(hidden->mutable_data<Tout>(ctx.GetPlace()));
    if (handler.is_NTC()) {
      handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
                             is_reverse, platform::RNNReorderType::NTC_PP);
    } else {
      handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
                             is_reverse, platform::RNNReorderType::TNC_PP);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_KERNEL(fusion_lstm, MKLDNN, paddle::platform::CPUPlace,
                   ops::FusionLSTMMKLDNNKernel<float>);
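The ReorderGates helper in the deleted file above rotates each gate block so that PaddlePaddle's {c, i, f, o} weight order matches oneDNN's {i, f, c, o}. Below is a minimal standalone sketch of that rotation on a single row of four gate blocks; it is plain C++ rather than Paddle code, and OC = 2 and the values are chosen only for illustration.

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  const size_t OC = 2;  // output channels per gate block
  // One row of 4 gate blocks in PaddlePaddle order {c, i, f, o}.
  std::vector<float> w = {/*c*/ 1, 1, /*i*/ 2, 2, /*f*/ 3, 3, /*o*/ 4, 4};

  float* base = w.data();
  // c <-> i : swap block 0 with block 1  -> {i, c, f, o}
  std::swap_ranges(base, base + OC, base + OC);
  // c <-> f : swap block 1 with block 2  -> {i, f, c, o}
  std::swap_ranges(base + OC, base + 2 * OC, base + 2 * OC);

  for (float v : w) std::cout << v << ' ';  // 2 2 3 3 1 1 4 4
  std::cout << '\n';
}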
paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h — deleted (100644 → 0, view file @ 7fbc68a2)

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/mkldnn_reuse.h"

namespace paddle {
namespace operators {

using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;

template <typename T, typename T_alg, typename T_out = T>
class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
 public:
  RNNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
                   const platform::MKLDNNDeviceContext& dev_ctx,
                   const mkldnn::engine mkldnn_engine,
                   platform::Place cpu_place, const LoDTensor* input,
                   const Tensor* weight_h, const Tensor* h0,
                   const bool is_reverse, const int64_t N, const int64_t Ti,
                   const int64_t IC, const int64_t OC, const int64_t G,
                   const std::string& unique_name)
      : platform::MKLDNNHandlerT<T, T_alg>(
            dev_ctx, dev_ctx.GetEngine(), cpu_place,
            CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>(), Ti)),
        N(N),
        Ti(Ti),
        IC(IC),
        OC(OC),
        G(G) {
    // Create memory key without Ti because weights, bias and h0 memories
    // do not depend on Ti size but primitive and input/output memory do
    memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded(
        dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>()));

    // Is it int8 kernel
    const bool is_INT8 = std::is_same<T, uint8_t>::value;

    if (is_INT8) {
      // Int8 attributes
      const float scale_data = ctx.Attr<float>("Scale_data");
      const float shift_data = ctx.Attr<float>("Shift_data");
      const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");

      const int weights_scale_mask =
          0 + (1 << 3)  // bit, indicating the unique scales for `g` dim in `ldigo`
          + (1 << 4);   // bit, indicating the unique scales for `o` dim in `ldigo`

      attr_.set_rnn_data_qparams(scale_data, shift_data);
      attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
    }
  }

  bool is_NTC() {
    return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
            dnnl::memory::format_tag::ntc);
  }

  void reorderRNNdata(void* input_data, void* output_data,
                      std::vector<size_t> lod, const bool is_reverse,
                      platform::RNNReorderType reorder_type) {
    switch (reorder_type) {
      // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
      case platform::RNNReorderType::PP_NTC: {
        auto* input_data_iter = reinterpret_cast<T*>(input_data);
        auto* output_data_iter = reinterpret_cast<T*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]) * IC;
          const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
          memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
                 sizeof(T) * num_elements);
          input_data_iter += num_elements;
        }
      } break;
      // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
      case platform::RNNReorderType::PP_TNC: {
        auto* input_data_iter = reinterpret_cast<T*>(input_data);
        auto* output_data_iter = reinterpret_cast<T*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]);
          const auto offset = is_reverse ? (Ti - num_elements) : 0;
          for (size_t t = 0; t < num_elements; ++t) {
            memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
                   input_data_iter, sizeof(T) * IC);
            input_data_iter += IC;
          }
        }
      } break;
      // Reorder output values to PP format [N, T, C] -> [WORDS, C]
      case platform::RNNReorderType::NTC_PP: {
        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]) * OC;
          const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
          memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
                 sizeof(T_out) * num_elements);
          output_data_iter += num_elements;
        }
      } break;
      // Reorder output values to PP format [T, N, C] -> [WORDS, C]
      case platform::RNNReorderType::TNC_PP: {
        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
        for (int n = 0; n < N; ++n) {
          const auto num_elements = lod[n + 1] - lod[n];
          const auto offset = is_reverse ? (Ti - num_elements) : 0;
          for (size_t t = 0; t < num_elements; ++t) {
            memcpy(output_data_iter,
                   input_data_iter + (t + offset) * N * OC + n * OC,
                   sizeof(T_out) * OC);
            output_data_iter += OC;
          }
        }
      } break;
    }
  }

  std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
      const LoDTensor* input, const bool is_reverse) {
    const auto name = this->key_ + "@input_mem";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
                                                this->engine_);
      this->dev_ctx_.SetBlob(name, memory_p);
    }

    const auto& input_lod = input->lod()[0];
    auto* x_data = to_void_cast(input->data<T>());

    auto* x_onednn_data = memory_p->get_data_handle();
    memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);

    if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
        dnnl::memory::format_tag::ntc) {
      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
                     platform::RNNReorderType::PP_NTC);
    } else {
      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
                     platform::RNNReorderType::PP_TNC);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
    const auto name = this->key_ + "@output_mem";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
                                                this->engine_);
      this->dev_ctx_.SetBlob(name, memory_p);
    }
    return memory_p;
  }

  // TODO(grygielski) H0 is for now persistable
  // TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
  // not support in yet)
  std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
    const std::string h0_key = memory_key_ + "@h0";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));

    if (!memory_p) {
      auto user_h0_memory = dnnl::memory();
      if (h0) {
        user_h0_memory =
            dnnl::memory({{1, 1, N, OC}, MKLDNNGetDataType<float>(),
                          MKLDNNMemoryFormat::ldnc},
                         this->engine_, to_void_cast(h0->data<float>()));
      } else {
        user_h0_memory = dnnl::memory({{1, 1, N, OC},
                                       MKLDNNGetDataType<float>(),
                                       MKLDNNMemoryFormat::ldnc},
                                      this->engine_);
        memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
      }
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
                                                this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_h0_memory, *memory_p, attr_)
          .execute(astream, user_h0_memory, *memory_p);

      this->dev_ctx_.SetBlob(h0_key, memory_p);
    }
    return memory_p;
  }

 protected:
  // RNN dimensions
  // N - Batch Size
  // Ti - Max sentence length
  // IC - Input Channels
  // OC - Output Channels
  // G - Number of gates
  const int64_t N, Ti, IC, OC, G;

  // Memory size of weights, bias and h0 does not depend
  // on Ti size, thus we need another key to cache them
  std::string memory_key_;
  dnnl::primitive_attr attr_;
};

}  // namespace operators
}  // namespace paddle
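The deleted handler keeps two cache keys: the base class key includes the max sequence length Ti (the primitive and its input/output memory depend on it), while memory_key_ omits Ti so reordered weights, bias, and h0 can be reused across batches with different maximum lengths. The following small sketch of that two-key scheme is illustrative only; the std::map stands in for the device context's blob cache and the names are hypothetical.

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Hypothetical stand-in for the device-context blob cache.
std::map<std::string, std::shared_ptr<std::vector<float>>> blob_cache;

std::shared_ptr<std::vector<float>> AcquireWeights(const std::string& memory_key) {
  // The weights key carries no Ti, so a later batch with a different max
  // sequence length still hits the already reordered weights.
  const std::string wx_key = memory_key + "@weight_x";
  auto it = blob_cache.find(wx_key);
  if (it != blob_cache.end()) return it->second;
  auto weights = std::make_shared<std::vector<float>>(16, 1.f);  // reorder would happen here
  blob_cache[wx_key] = weights;
  return weights;
}

int main() {
  const std::string unique_name = "fusion_gru/X+WeightH";
  for (int Ti : {5, 9}) {  // two batches with different max sequence lengths
    const std::string primitive_key = unique_name + "@Ti" + std::to_string(Ti);  // per-primitive
    const std::string memory_key = unique_name;                                  // Ti-independent
    AcquireWeights(memory_key);
    std::cout << primitive_key << " -> cached weight blobs: " << blob_cache.size() << '\n';  // stays 1
  }
}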
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py (view file @ 824a79d3)

@@ -75,6 +75,4 @@ class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp):

 if __name__ == "__main__":
-    from paddle import enable_static
-    enable_static()
     unittest.main()
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_mkldnn_op.py — deleted (100644 → 0, view file @ 7fbc68a2)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
from paddle.fluid.tests.unittests.test_fusion_lstm_op import TestFusionLSTMOp


class TestFusionLSTMONEDNNOp(TestFusionLSTMOp):
    def set_conf(self):
        self.use_mkldnn = True

    def test_check_output(self):
        for use_seq in {True, False}:
            self.attrs['use_seq'] = use_seq
            self.check_output(check_dygraph=False, no_check_set=["Cell"])


class TestFusionLSTMONEDNNOpReverse(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.is_reverse = True
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpInitReverse(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.has_initial_state = True
        self.is_reverse = True
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpMD1(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.M = 36
        self.D = 8
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpMD2(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.M = 8
        self.D = 8
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpMD3(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.M = 15
        self.D = 3
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpBS1(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.lod = [[3]]
        self.D = 16
        self.use_mkldnn = True


class TestFusionLSTMONEDNNOpPeepholesInit(TestFusionLSTMONEDNNOp):
    def set_conf(self):
        self.use_peepholes = True
        self.has_initial_state = True
        self.use_mkldnn = True


if __name__ == '__main__':
    from paddle import enable_static
    enable_static()
    unittest.main()
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py (view file @ 824a79d3)

@@ -144,6 +144,4 @@ class TestFusionGRUOpBS1(TestFusionGRUOp):

 if __name__ == "__main__":
-    from paddle import enable_static
-    enable_static()
     unittest.main()
python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py (view file @ 824a79d3)

@@ -58,7 +58,6 @@ class TestFusionLSTMOp(OpTest):
         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
-        self.use_mkldnn = False
         self.set_conf()
         T = sum(self.lod[0])

@@ -111,8 +110,7 @@ class TestFusionLSTMOp(OpTest):
             'is_reverse': self.is_reverse,
             'gate_activation': self.act_gate,
             'cell_activation': self.act_cell,
-            'candidate_activation': self.act_cand,
-            'use_mkldnn': self.use_mkldnn
+            'candidate_activation': self.act_cand
         }

     def test_check_output(self):

@@ -193,6 +191,4 @@ class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):

 if __name__ == '__main__':
-    from paddle import enable_static
-    enable_static()
     unittest.main()
python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py (view file @ 824a79d3)

@@ -29,5 +29,4 @@ no_check_set_white_list = [
     'update_loss_scaling',
     'cudnn_lstm',
     'rnn',
-    'fusion_lstm',
 ]
tools/static_mode_white_list.py (view file @ 824a79d3)

@@ -601,7 +601,6 @@ STATIC_MODE_TESTING_LIST = [
     'test_bilinear_interp_mkldnn_op',
     'test_fusion_gru_int8_mkldnn_op',
     'test_fusion_gru_mkldnn_op',
-    'test_fusion_lstm_mkldnn_op',
     'test_gaussian_random_mkldnn_op',
     'test_lrn_mkldnn_op',
     'test_matmul_mkldnn_op',