Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
8f913295
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
695
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8f913295
编写于
8月 15, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fuse fc in lstm
上级
ddb05dff
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
136 addition
and
94 deletion
+136
-94
paddle/fluid/operators/fusion_lstm_op.cc
paddle/fluid/operators/fusion_lstm_op.cc
+136
-90
paddle/fluid/operators/fusion_lstm_op.h
paddle/fluid/operators/fusion_lstm_op.h
+0
-4
未找到文件。
paddle/fluid/operators/fusion_lstm_op.cc
浏览文件 @
8f913295
...
@@ -14,29 +14,37 @@ limitations under the License. */
...
@@ -14,29 +14,37 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
void
FusionLSTMOp
::
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
{
void
FusionLSTMOp
::
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Input"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of LSTM should not be null."
);
"Input(Input) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"WeightX"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Weight"
),
"Input(WeightX) of LSTM should not be null."
);
"Input(Weight) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"WeightH"
),
"Input(WeightH) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Bias"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Bias"
),
"Input(Bias) of LSTM should not be null."
);
"Input(Bias) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"XX"
),
"Output(XX) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Hidden"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Hidden"
),
"Output(Hidden) of LSTM should not be null."
);
"Output(Hidden) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Cell"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Cell"
),
"Output(Cell) of LSTM should not be null."
);
"Output(Cell) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BatchGate"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Batch
ed
Gate"
),
"Output(BatchGate) of LSTM should not be null."
);
"Output(Batch
ed
Gate) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BatchCellPreAct"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BatchCellPreAct"
),
"Output(BatchGate) of LSTM should not be null."
);
"Output(Batch
ed
Gate) of LSTM should not be null."
);
auto
in_dims
=
ctx
->
GetInputDim
(
"Input
"
);
auto
x_dims
=
ctx
->
GetInputDim
(
"X
"
);
PADDLE_ENFORCE_EQ
(
in
_dims
.
size
(),
2
,
"Input(X)'s rank must be 2."
);
PADDLE_ENFORCE_EQ
(
x
_dims
.
size
(),
2
,
"Input(X)'s rank must be 2."
);
if
(
ctx
->
HasInput
(
"H0"
))
{
if
(
ctx
->
HasInput
(
"H0"
))
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"C0"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"C0"
),
...
@@ -49,15 +57,24 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -49,15 +57,24 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
"should be the same."
);
"should be the same."
);
}
}
int
frame_size
=
in_dims
[
1
]
/
4
;
auto
wx_dims
=
ctx
->
GetInputDim
(
"WeightX"
);
auto
w_dims
=
ctx
->
GetInputDim
(
"Weight"
);
PADDLE_ENFORCE_EQ
(
wx_dims
.
size
(),
2
,
PADDLE_ENFORCE_EQ
(
w_dims
.
size
(),
2
,
"The rank of Input(Weight) should be 2."
);
"The rank of Input(WeightX) should be 2."
);
PADDLE_ENFORCE_EQ
(
w_dims
[
0
],
frame_size
,
PADDLE_ENFORCE_EQ
(
wx_dims
[
0
],
x_dims
[
1
],
"The first dimension of Input(Weight) "
"The first dimension of Input(WeightX) "
"should be %d."
,
x_dims
[
1
]);
int
frame_size
=
wx_dims
[
1
]
/
4
;
auto
wh_dims
=
ctx
->
GetInputDim
(
"WeightH"
);
PADDLE_ENFORCE_EQ
(
wh_dims
.
size
(),
2
,
"The rank of Input(WeightH) should be 2."
);
PADDLE_ENFORCE_EQ
(
wh_dims
[
0
],
frame_size
,
"The first dimension of Input(WeightH) "
"should be %d."
,
"should be %d."
,
frame_size
);
frame_size
);
PADDLE_ENFORCE_EQ
(
w_dims
[
1
],
4
*
frame_size
,
PADDLE_ENFORCE_EQ
(
w
h
_dims
[
1
],
4
*
frame_size
,
"The second dimension of Input(Weight) "
"The second dimension of Input(Weight
H
) "
"should be 4 * %d."
,
"should be 4 * %d."
,
frame_size
);
frame_size
);
...
@@ -66,36 +83,35 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -66,36 +83,35 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ
(
b_dims
[
0
],
1
,
PADDLE_ENFORCE_EQ
(
b_dims
[
0
],
1
,
"The first dimension of Input(Bias) should be 1."
);
"The first dimension of Input(Bias) should be 1."
);
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"use_peepholes"
))
{
PADDLE_ENFORCE
(
!
ctx
->
Attrs
().
Get
<
bool
>
(
"use_peepholes"
),
PADDLE_ENFORCE_EQ
(
b_dims
[
1
],
7
*
frame_size
,
"Do not support peephole yet."
);
"The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection"
,
frame_size
);
}
else
{
PADDLE_ENFORCE_EQ
(
b_dims
[
1
],
4
*
frame_size
,
PADDLE_ENFORCE_EQ
(
b_dims
[
1
],
4
*
frame_size
,
"The second dimension of Input(Bias) should be "
"The second dimension of Input(Bias) should be "
"4 * %d if disable peepholes connection"
,
"4 * %d if disable peepholes connection"
,
frame_size
);
frame_size
);
}
framework
::
DDim
out_dims
({
in
_dims
[
0
],
frame_size
});
framework
::
DDim
out_dims
({
x
_dims
[
0
],
frame_size
});
ctx
->
SetOutputDim
(
"Hidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"Hidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"Cell"
,
out_dims
);
ctx
->
SetOutputDim
(
"Cell"
,
out_dims
);
ctx
->
SetOutputDim
(
"Batch
Gate"
,
in_dims
);
ctx
->
SetOutputDim
(
"Batch
edGate"
,
{
x_dims
[
0
],
wx_dims
[
1
]}
);
ctx
->
SetOutputDim
(
"BatchCellPreAct"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchCellPreAct"
,
out_dims
);
ctx
->
ShareLoD
(
"Input"
,
"Hidden"
);
ctx
->
ShareLoD
(
"X"
,
"Hidden"
);
ctx
->
ShareLoD
(
"Input"
,
"Cell"
);
ctx
->
ShareLoD
(
"X"
,
"Cell"
);
int
xx_width
=
x_dims
[
1
]
>
wx_dims
[
1
]
?
wx_dims
[
1
]
:
x_dims
[
1
];
ctx
->
SetOutputDim
(
"XX"
,
{
x_dims
[
0
],
xx_width
});
ctx
->
ShareLoD
(
"X"
,
"XX"
);
}
}
framework
::
OpKernelType
FusionLSTMOp
::
GetExpectedKernelType
(
framework
::
OpKernelType
FusionLSTMOp
::
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
const
framework
::
ExecutionContext
&
ctx
)
const
{
return
framework
::
OpKernelType
(
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"
Input
"
)
->
type
()),
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"
X
"
)
->
type
()),
ctx
.
device_context
());
ctx
.
device_context
());
}
}
void
FusionLSTMOpMaker
::
Make
()
{
void
FusionLSTMOpMaker
::
Make
()
{
AddInput
(
"
Input
"
,
AddInput
(
"
X
"
,
"(LoDTensor) the first input is a LodTensor, which support "
"(LoDTensor) the first input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in "
"variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X 4D), where T is the "
"this LoDTensor is a matrix with shape (T X 4D), where T is the "
...
@@ -130,7 +146,12 @@ void FusionLSTMOpMaker::Make() {
...
@@ -130,7 +146,12 @@ void FusionLSTMOpMaker::Make() {
AddOutput
(
"Cell"
,
AddOutput
(
"Cell"
,
"(LoDTensor) the cell state of LSTM operator. "
"(LoDTensor) the cell state of LSTM operator. "
"The shape is (T x D), and lod is the same with the `Input`."
);
"The shape is (T x D), and lod is the same with the `Input`."
);
AddOutput
(
"BatchGate"
,
AddOutput
(
"XX"
,
"(LoDTensor) the first input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X 4D), where T is the "
"total time steps in this mini-batch, D is the hidden size."
);
AddOutput
(
"BatchedGate"
,
"(LoDTensor) This LoDTensor contains input gate, forget gate "
"(LoDTensor) This LoDTensor contains input gate, forget gate "
"and output gate after the nonlinear computation. This "
"and output gate after the nonlinear computation. This "
"LoDTensor has the same shape as the reorganized input, which "
"LoDTensor has the same shape as the reorganized input, which "
...
@@ -219,80 +240,102 @@ inline void ReorderInitState(const DeviceContext& ctx,
...
@@ -219,80 +240,102 @@ inline void ReorderInitState(const DeviceContext& ctx,
framework
::
Tensor
*
dst
,
bool
indexed_src
)
{
framework
::
Tensor
*
dst
,
bool
indexed_src
)
{
math
::
CopyMatrixRowsFunctor
<
DeviceContext
,
T
>
row_shuffle
;
math
::
CopyMatrixRowsFunctor
<
DeviceContext
,
T
>
row_shuffle
;
dst
->
mutable_data
<
T
>
(
src
.
dims
(),
ctx
.
GetPlace
());
dst
->
mutable_data
<
T
>
(
src
.
dims
(),
ctx
.
GetPlace
());
// TODO(TJ): check mem copy perf
row_shuffle
(
ctx
,
src
,
index_lod
,
dst
,
indexed_src
);
row_shuffle
(
ctx
,
src
,
index_lod
,
dst
,
indexed_src
);
}
}
// TODO(TJ): can move to math::details
template
<
typename
DeviceContext
,
typename
T
>
inline
void
SimpleFC
(
const
math
::
BlasT
<
DeviceContext
,
T
>&
blas
,
const
int
M
,
const
int
N
,
const
int
K
,
const
T
*
A
,
const
T
*
B
,
T
*
C
,
const
T
*
bias_data
=
NULL
)
{
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
M
,
N
,
K
,
static_cast
<
T
>
(
1
),
A
,
B
,
static_cast
<
T
>
(
0
),
C
);
if
(
bias_data
)
{
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
blas
.
AXPY
(
N
,
static_cast
<
T
>
(
1
),
bias_data
,
C
+
i
*
N
);
}
}
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
LSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
class
Fuison
LSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
input
=
ctx
.
Input
<
LoDTensor
>
(
"Input"
);
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
weight
=
ctx
.
Input
<
Tensor
>
(
"Weight"
);
auto
*
wx
=
ctx
.
Input
<
Tensor
>
(
"WeightX"
);
// x*4D
auto
*
wh
=
ctx
.
Input
<
Tensor
>
(
"WeightH"
);
// D*4D
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
hidden_t0
=
ctx
.
Input
<
Tensor
>
(
"H0"
);
auto
*
hidden_t0
=
ctx
.
Input
<
Tensor
>
(
"H0"
);
auto
*
cell_t0
=
ctx
.
Input
<
Tensor
>
(
"C0"
);
auto
*
cell_t0
=
ctx
.
Input
<
Tensor
>
(
"C0"
);
auto
*
batch_gate
=
ctx
.
Output
<
LoDTensor
>
(
"BatchGate"
);
// the result after x*Wx (size: sum_words*4D) or batched_x (size:
batch_gate
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// sum_words*x)
auto
*
xx
=
ctx
.
Output
<
LoDTensor
>
(
"XX"
);
auto
*
batched_gate
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedGate"
);
auto
*
hidden_out
=
ctx
.
Output
<
LoDTensor
>
(
"Hidden"
);
auto
*
hidden_out
=
ctx
.
Output
<
LoDTensor
>
(
"Hidden"
);
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
cell_out
=
ctx
.
Output
<
LoDTensor
>
(
"Cell"
);
auto
*
cell_out
=
ctx
.
Output
<
LoDTensor
>
(
"Cell"
);
bool
is_reverse
=
ctx
.
Attr
<
bool
>
(
"is_reverse"
);
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
batched_gate_data
=
batched_gate
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
cell_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
cell_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
bool
is_reverse
=
ctx
.
Attr
<
bool
>
(
"is_reverse"
);
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
wx_data
=
wx
->
data
<
T
>
();
auto
x_dims
=
x
->
dims
();
auto
wx_dims
=
wx
->
dims
();
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
auto
&
device_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
to_batch
(
device_ctx
,
*
input
,
batch_gate
,
true
,
is_reverse
);
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
// TODO(TJ): op test these two cases
auto
in_dims
=
input
->
dims
();
if
(
x_dims
[
1
]
>
wx_dims
[
1
])
{
int
frame_size
=
static_cast
<
int
>
(
in_dims
[
1
]
/
4
);
SimpleFC
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
wx_dims
[
1
],
x_dims
[
1
],
x_data
,
framework
::
DDim
dims
({
in_dims
[
0
],
frame_size
});
wx_data
,
xx_data
,
bias
->
data
<
T
>
());
to_batch
(
dev_ctx
,
*
xx
,
batched_gate
,
true
,
is_reverse
);
if
(
bias
)
{
}
else
{
Tensor
b
=
*
bias
;
to_batch
(
dev_ctx
,
*
x
,
xx
,
true
,
is_reverse
);
b
.
Resize
({
bias
->
numel
(),
1
});
SimpleFC
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
wx_dims
[
1
],
x_dims
[
1
],
Tensor
gate_bias
=
b
.
Slice
(
0
,
4
*
frame_size
);
xx_data
,
wx_data
,
batched_gate_data
,
math
::
RowwiseAdd
<
DeviceContext
,
T
>
add_bias
;
bias
->
data
<
T
>
());
add_bias
(
device_ctx
,
*
batch_gate
,
gate_bias
,
batch_gate
);
}
}
int
frame_size
=
static_cast
<
int
>
(
wx_dims
[
1
]
/
4
);
framework
::
DDim
out_dims
({
x_dims
[
0
],
frame_size
});
math
::
LstmMetaValue
<
T
>
lstm_value
;
math
::
LstmMetaValue
<
T
>
lstm_value
;
if
(
bias
&&
ctx
.
Attr
<
bool
>
(
"use_peepholes"
))
{
// no peephole
T
*
bias_data
=
const_cast
<
T
*>
(
bias
->
data
<
T
>
());
// the code style in LstmMetaValue will be updated later.
lstm_value
.
check_ig
=
bias_data
+
4
*
frame_size
;
lstm_value
.
check_fg
=
lstm_value
.
check_ig
+
frame_size
;
lstm_value
.
check_og
=
lstm_value
.
check_fg
+
frame_size
;
}
else
{
lstm_value
.
check_ig
=
nullptr
;
lstm_value
.
check_ig
=
nullptr
;
lstm_value
.
check_fg
=
nullptr
;
lstm_value
.
check_fg
=
nullptr
;
lstm_value
.
check_og
=
nullptr
;
lstm_value
.
check_og
=
nullptr
;
}
lstm_value
.
prev_state_value
=
nullptr
;
lstm_value
.
prev_state_value
=
nullptr
;
Tensor
ordered_c0
;
Tensor
ordered_c0
;
framework
::
Vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
framework
::
Vector
<
size_t
>
order
(
batch
ed
_gate
->
lod
()[
2
]);
if
(
cell_t0
)
{
if
(
cell_t0
)
{
// Since the batch computing for LSTM reorders the input sequence
// Since the batch computing for LSTM reorders the input sequence
// according to their length. The initialized cell state also needs
// according to their length. The initialized cell state also needs
// to reorder.
// to reorder.
ReorderInitState
<
DeviceContext
,
T
>
(
dev
ice_ctx
,
*
cell_t0
,
order
,
ReorderInitState
<
DeviceContext
,
T
>
(
dev
_ctx
,
*
cell_t0
,
order
,
&
ordered_c0
,
&
ordered_c0
,
true
);
true
);
lstm_value
.
prev_state_value
=
ordered_c0
.
data
<
T
>
();
lstm_value
.
prev_state_value
=
ordered_c0
.
data
<
T
>
();
}
}
// Use the local variable as here.
// Use the local variable as here.
LoDTensor
batch_hidden
,
batch_cell
;
LoDTensor
batch_hidden
,
batch_cell
;
auto
*
batch_cell_pre_act
=
ctx
.
Output
<
LoDTensor
>
(
"BatchCellPreAct"
);
auto
*
batch_cell_pre_act
=
ctx
.
Output
<
LoDTensor
>
(
"BatchCellPreAct"
);
batch_hidden
.
mutable_data
<
T
>
(
dims
,
ctx
.
GetPlace
());
batch_hidden
.
mutable_data
<
T
>
(
out_
dims
,
ctx
.
GetPlace
());
batch_cell
.
mutable_data
<
T
>
(
dims
,
ctx
.
GetPlace
());
batch_cell
.
mutable_data
<
T
>
(
out_
dims
,
ctx
.
GetPlace
());
batch_cell_pre_act
->
mutable_data
<
T
>
(
dims
,
ctx
.
GetPlace
());
batch_cell_pre_act
->
mutable_data
<
T
>
(
out_
dims
,
ctx
.
GetPlace
());
auto
batch_starts
=
batch_gate
->
lod
()[
0
];
auto
batch_starts
=
batch
ed
_gate
->
lod
()[
0
];
size_t
num_batch
=
batch_starts
.
size
()
-
1
;
size_t
max_seq_len
=
batch_starts
.
size
()
-
1
;
auto
gate_act
=
math
::
detail
::
GetActivationType
(
auto
gate_act
=
math
::
detail
::
GetActivationType
(
ctx
.
Attr
<
std
::
string
>
(
"gate_activation"
));
ctx
.
Attr
<
std
::
string
>
(
"gate_activation"
));
auto
cell_act
=
math
::
detail
::
GetActivationType
(
auto
cell_act
=
math
::
detail
::
GetActivationType
(
...
@@ -300,12 +343,11 @@ class LSTMKernel : public framework::OpKernel<T> {
...
@@ -300,12 +343,11 @@ class LSTMKernel : public framework::OpKernel<T> {
auto
cand_act
=
math
::
detail
::
GetActivationType
(
auto
cand_act
=
math
::
detail
::
GetActivationType
(
ctx
.
Attr
<
std
::
string
>
(
"candidate_activation"
));
ctx
.
Attr
<
std
::
string
>
(
"candidate_activation"
));
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
device_ctx
);
for
(
size_t
n
=
0
;
n
<
max_seq_len
;
n
++
)
{
for
(
size_t
n
=
0
;
n
<
num_batch
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
Tensor
gate_t
=
batch
ed
_gate
->
Slice
(
bstart
,
bend
);
Tensor
out_t
=
batch_hidden
.
Slice
(
bstart
,
bend
);
Tensor
out_t
=
batch_hidden
.
Slice
(
bstart
,
bend
);
Tensor
cell_t
=
batch_cell
.
Slice
(
bstart
,
bend
);
Tensor
cell_t
=
batch_cell
.
Slice
(
bstart
,
bend
);
Tensor
cell_pre_act_t
=
batch_cell_pre_act
->
Slice
(
bstart
,
bend
);
Tensor
cell_pre_act_t
=
batch_cell_pre_act
->
Slice
(
bstart
,
bend
);
...
@@ -316,9 +358,11 @@ class LSTMKernel : public framework::OpKernel<T> {
...
@@ -316,9 +358,11 @@ class LSTMKernel : public framework::OpKernel<T> {
int
pre_h_start
=
static_cast
<
int
>
(
batch_starts
[
n
-
1
]);
int
pre_h_start
=
static_cast
<
int
>
(
batch_starts
[
n
-
1
]);
int
pre_h_end
=
pre_h_start
+
cur_batch_size
;
int
pre_h_end
=
pre_h_start
+
cur_batch_size
;
auto
pre_hidden_t
=
batch_hidden
.
Slice
(
pre_h_start
,
pre_h_end
);
auto
pre_hidden_t
=
batch_hidden
.
Slice
(
pre_h_start
,
pre_h_end
);
blas
.
MatMul
(
pre_hidden_t
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
// TODO(TJ): use gemm directly
blas
.
MatMul
(
pre_hidden_t
,
false
,
*
wh
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
&
gate_t
,
static_cast
<
T
>
(
1.0
));
}
else
if
(
hidden_t0
)
{
}
else
if
(
hidden_t0
)
{
// TODO(TJ): move h0 outside for
// If n == 0 and there is no initialized hidden state, that is to say
// If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skiped.
// the H0 is zeros, the calculation W_h * H0 will be skiped.
// If n == 0 and there is initialized hidden state, calculate W_h * H0.
// If n == 0 and there is initialized hidden state, calculate W_h * H0.
...
@@ -327,10 +371,11 @@ class LSTMKernel : public framework::OpKernel<T> {
...
@@ -327,10 +371,11 @@ class LSTMKernel : public framework::OpKernel<T> {
// according to their length. The initialized hidden state also needs
// according to their length. The initialized hidden state also needs
// to reorder.
// to reorder.
Tensor
ordered_h0
;
Tensor
ordered_h0
;
ReorderInitState
<
DeviceContext
,
T
>
(
dev
ice
_ctx
,
*
hidden_t0
,
order
,
ReorderInitState
<
DeviceContext
,
T
>
(
dev_ctx
,
*
hidden_t0
,
order
,
&
ordered_h0
,
true
);
&
ordered_h0
,
true
);
blas
.
MatMul
(
ordered_h0
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
// TODO(TJ): use gemm directly
&
gate_t
,
static_cast
<
T
>
(
1.0
));
blas
.
MatMul
(
ordered_h0
,
false
,
*
wh
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
}
}
lstm_value
.
gate_value
=
gate_t
.
data
<
T
>
();
lstm_value
.
gate_value
=
gate_t
.
data
<
T
>
();
...
@@ -338,19 +383,19 @@ class LSTMKernel : public framework::OpKernel<T> {
...
@@ -338,19 +383,19 @@ class LSTMKernel : public framework::OpKernel<T> {
lstm_value
.
state_value
=
cell_t
.
data
<
T
>
();
lstm_value
.
state_value
=
cell_t
.
data
<
T
>
();
lstm_value
.
state_active_value
=
cell_pre_act_t
.
data
<
T
>
();
lstm_value
.
state_active_value
=
cell_pre_act_t
.
data
<
T
>
();
math
::
LstmUnitFunctor
<
DeviceContext
,
T
>::
compute
(
math
::
LstmUnitFunctor
<
DeviceContext
,
T
>::
compute
(
dev
ice_ctx
,
lstm_value
,
frame_size
,
cur_batch_size
,
gate
_act
,
dev
_ctx
,
lstm_value
,
frame_size
,
cur_batch_size
,
gate_act
,
cell
_act
,
c
ell_act
,
c
and_act
);
cand_act
);
lstm_value
.
prev_state_value
=
lstm_value
.
state_value
;
lstm_value
.
prev_state_value
=
lstm_value
.
state_value
;
}
}
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batch_hidden
.
set_lod
(
batch_gate
->
lod
());
batch_hidden
.
set_lod
(
batch
ed
_gate
->
lod
());
// restore the output hidden in LoDTensor from the batch hidden
// restore the output hidden in LoDTensor from the batch hidden
to_seq
(
dev
ice
_ctx
,
batch_hidden
,
hidden_out
);
to_seq
(
dev_ctx
,
batch_hidden
,
hidden_out
);
batch_cell
.
set_lod
(
batch_gate
->
lod
());
batch_cell
.
set_lod
(
batch
ed
_gate
->
lod
());
// restore the output cell state in LoDTensor from the batch cell
// restore the output cell state in LoDTensor from the batch cell
to_seq
(
dev
ice
_ctx
,
batch_cell
,
cell_out
);
to_seq
(
dev_ctx
,
batch_cell
,
cell_out
);
}
}
};
};
...
@@ -358,9 +403,10 @@ class LSTMKernel : public framework::OpKernel<T> {
...
@@ -358,9 +403,10 @@ class LSTMKernel : public framework::OpKernel<T> {
}
// namespace paddle
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
lstm
,
ops
::
LSTMOp
,
ops
::
LSTMOpMaker
,
REGISTER_OPERATOR
(
fusion_lstm
,
ops
::
FusionLSTMOp
,
ops
::
Fusion
LSTMOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OP_CPU_KERNEL
(
REGISTER_OP_CPU_KERNEL
(
fusion_lstm
,
ops
::
LSTMKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
fusion_lstm
,
ops
::
LSTMKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
ops
::
FuisonLSTMKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
FuisonLSTMKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/fusion_lstm_op.h
浏览文件 @
8f913295
...
@@ -15,10 +15,6 @@ limitations under the License. */
...
@@ -15,10 +15,6 @@ limitations under the License. */
#pragma once
#pragma once
// #include <string>
// #include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
// #include "paddle/fluid/operators/math/blas.h"
// #include "paddle/fluid/operators/math/detail/activation_functions.h"
// #include "paddle/fluid/operators/math/lstm_compute.h"
// #include "paddle/fluid/operators/math/sequence2batch.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录