Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
s920243400
PaddleDetection
提交
2d0ddf8c
P
PaddleDetection
项目概览
s920243400
/
PaddleDetection
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleDetection
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2d0ddf8c
编写于
8月 30, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refine cpu gru batch mode
上级
70d39812
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
146 addition
and
154 deletion
+146
-154
paddle/fluid/operators/fusion_gru_op.cc
paddle/fluid/operators/fusion_gru_op.cc
+141
-149
paddle/fluid/operators/math/sequence2batch.h
paddle/fluid/operators/math/sequence2batch.h
+5
-5
未找到文件。
paddle/fluid/operators/fusion_gru_op.cc
浏览文件 @
2d0ddf8c
...
@@ -13,16 +13,13 @@ See the License for the specific language governing permissions and
...
@@ -13,16 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/fusion_gru_op.h"
#include "paddle/fluid/operators/fusion_gru_op.h"
#include <cstring> // for memcpy
#include <string>
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -35,12 +32,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -35,12 +32,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"Input(WeightH) of GRU should not be null."
);
"Input(WeightH) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"XX"
),
"Output(XX) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"XX"
),
"Output(XX) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"
BatchedGate
"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"
ReorderedH0
"
),
"Output(
BatchedGate
) of GRU should not be null."
);
"Output(
ReorderedH0
) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Batch
ResetHiddenPrev
"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Batch
edInput
"
),
"Output(Batch
ResetHiddenPrev
) of GRU should not be null."
);
"Output(Batch
edInput
) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Batched
Hidden
"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Batched
Out
"
),
"Output(Batched
Hidden
) of GRU should not be null."
);
"Output(Batched
Out
) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Hidden"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Hidden"
),
"Output(Hidden) of GRU should not be null."
);
"Output(Hidden) of GRU should not be null."
);
...
@@ -83,9 +80,8 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -83,9 +80,8 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
}
}
framework
::
DDim
out_dims
({
x_dims
[
0
],
frame_size
});
framework
::
DDim
out_dims
({
x_dims
[
0
],
frame_size
});
ctx
->
SetOutputDim
(
"Hidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"Hidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchedGate"
,
{
x_dims
[
0
],
wx_dims
[
1
]});
ctx
->
SetOutputDim
(
"BatchedInput"
,
{
x_dims
[
0
],
wx_dims
[
1
]});
ctx
->
SetOutputDim
(
"BatchedHidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchedOut"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchResetHiddenPrev"
,
out_dims
);
ctx
->
ShareLoD
(
"X"
,
"Hidden"
);
ctx
->
ShareLoD
(
"X"
,
"Hidden"
);
int
xx_width
=
x_dims
[
1
]
>
wx_dims
[
1
]
?
wx_dims
[
1
]
:
x_dims
[
1
];
int
xx_width
=
x_dims
[
1
]
>
wx_dims
[
1
]
?
wx_dims
[
1
]
:
x_dims
[
1
];
...
@@ -115,22 +111,26 @@ void FusionGRUOpMaker::Make() {
...
@@ -115,22 +111,26 @@ void FusionGRUOpMaker::Make() {
"(Tensor) The FC weight with shape (M x 3D),"
"(Tensor) The FC weight with shape (M x 3D),"
"where M is the dim size of x, D is the hidden size. "
);
"where M is the dim size of x, D is the hidden size. "
);
AddInput
(
"WeightH"
,
AddInput
(
"WeightH"
,
"(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "
);
"(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "
"This weight is not exactly D x 3D as: {W_update, W_reset, W_state}"
"Acutally they are D x 2D and D x D two part weights."
"{W_update, W_reset; W_state}"
"{D x (D + D); D x D}"
);
AddInput
(
"Bias"
,
AddInput
(
"Bias"
,
"(Tensor, optional) (1 x 3D)."
"(Tensor, optional) (1 x 3D)."
"Almost same as GRUOp."
"Almost same as GRUOp."
"Note: if have FC bias it should be added on this bias."
)
"Note: if have FC bias it should be added on this bias."
)
.
AsDispensable
();
.
AsDispensable
();
AddOutput
(
"ReorderedH0"
,
"(Tensor) (N x D), which N is the min-batch size."
)
.
AsIntermediate
();
AddOutput
(
"XX"
,
AddOutput
(
"XX"
,
"(LoDTensor) the result after X * WeightX (size is T x
4
D)"
"(LoDTensor) the result after X * WeightX (size is T x
3
D)"
" or batched_X (size is T x M), this will be automatically chosen,"
" or batched_X (size is T x M), this will be automatically chosen,"
" where T is the total time steps in this mini-batch,"
" where T is the total time steps in this mini-batch,"
" D is the hidden size, M is the dim size of x input."
)
" D is the hidden size, M is the dim size of x input."
)
.
AsIntermediate
();
.
AsIntermediate
();
AddOutput
(
"BatchedGate"
,
"(LoDTensor) Same as GRUOp"
).
AsIntermediate
();
AddOutput
(
"BatchedInput"
,
"(LoDTensor) (T x 3D)"
).
AsIntermediate
();
AddOutput
(
"BatchResetHiddenPrev"
,
"(LoDTensor) (T x 3D) Same as GRUOp."
)
AddOutput
(
"BatchedOut"
,
"(LoDTensor) (T X D) save batched hidden."
)
.
AsIntermediate
();
AddOutput
(
"BatchedHidden"
,
"(LoDTensor) (T X D) Same as GRUOp."
)
.
AsIntermediate
();
.
AsIntermediate
();
AddOutput
(
"Hidden"
,
"(LoDTensor) (T x D) Same as GRUOp"
);
AddOutput
(
"Hidden"
,
"(LoDTensor) (T x D) Same as GRUOp"
);
AddAttr
<
std
::
string
>
(
"activation"
,
AddAttr
<
std
::
string
>
(
"activation"
,
...
@@ -153,45 +153,53 @@ more details can refer to GRU op.
...
@@ -153,45 +153,53 @@ more details can refer to GRU op.
)DOC"
);
)DOC"
);
}
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
T
>
inline
void
ReorderInitState
(
const
DeviceContext
&
ctx
,
const
framework
::
Tensor
&
src
,
framework
::
Vector
<
size_t
>
index_lod
,
framework
::
Tensor
*
dst
,
bool
indexed_src
)
{
math
::
CopyMatrixRowsFunctor
<
DeviceContext
,
T
>
row_shuffle
;
dst
->
mutable_data
<
T
>
(
src
.
dims
(),
ctx
.
GetPlace
());
row_shuffle
(
ctx
,
src
,
index_lod
,
dst
,
indexed_src
);
}
template
<
typename
DeviceContext
,
typename
T
>
class
FusionGRUKernel
:
public
framework
::
OpKernel
<
T
>
{
class
FusionGRUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
wx
=
ctx
.
Input
<
Tensor
>
(
"WeightX"
);
auto
*
wx
=
ctx
.
Input
<
Tensor
>
(
"WeightX"
);
auto
*
wh
=
ctx
.
Input
<
Tensor
>
(
"WeightH"
);
auto
*
wh
=
ctx
.
Input
<
Tensor
>
(
"WeightH"
);
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
h0
=
ctx
.
Input
<
Tensor
>
(
"H0"
);
auto
*
h0
=
ctx
.
Input
<
Tensor
>
(
"H0"
);
auto
*
reordered_h0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedH0"
);
auto
*
xx
=
ctx
.
Output
<
LoDTensor
>
(
"XX"
);
auto
*
xx
=
ctx
.
Output
<
LoDTensor
>
(
"XX"
);
auto
*
batched_gate
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedGate"
);
auto
*
batched_input
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedInput"
);
auto
*
batch_reset_hidden_prev
=
auto
*
batched_out
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedOut"
);
ctx
.
Output
<
LoDTensor
>
(
"BatchResetHiddenPrev"
);
auto
*
batch_hidden
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedHidden"
);
auto
*
hidden_out
=
ctx
.
Output
<
LoDTensor
>
(
"Hidden"
);
auto
*
hidden_out
=
ctx
.
Output
<
LoDTensor
>
(
"Hidden"
);
bool
is_reverse
=
ctx
.
Attr
<
bool
>
(
"is_reverse"
);
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
bool
is_reverse
=
ctx
.
Attr
<
bool
>
(
"is_reverse"
);
T
*
batched_gate_data
=
batched_gate
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
std
::
function
<
void
(
const
int
,
const
T
*
,
T
*
)
>
act_gate
,
act_state
;
batch_reset_hidden_prev
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
std
::
function
<
void
(
const
int
,
const
T
,
const
T
*
,
T
*
)
>
bias_sub
;
batch_hidden
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
act_gate_str
=
ctx
.
Attr
<
std
::
string
>
(
"gate_activation"
);
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
act_state_str
=
ctx
.
Attr
<
std
::
string
>
(
"activation"
);
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx
))
{
math
::
VecActivations
<
T
,
platform
::
jit
::
avx
>
act_functor
;
act_gate
=
act_functor
(
act_gate_str
);
act_state
=
act_functor
(
act_state_str
);
bias_sub
=
math
::
vec_bias_sub
<
T
,
platform
::
jit
::
avx
>
;
}
else
{
math
::
VecActivations
<
T
,
platform
::
jit
::
isa_any
>
act_functor
;
act_gate
=
act_functor
(
act_gate_str
);
act_state
=
act_functor
(
act_state_str
);
bias_sub
=
math
::
vec_bias_sub
<
T
,
platform
::
jit
::
isa_any
>
;
}
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
wx_data
=
wx
->
data
<
T
>
();
const
T
*
wx_data
=
wx
->
data
<
T
>
();
const
T
*
wh_data
=
wh
->
data
<
T
>
();
const
T
*
wh_data
=
wh
->
data
<
T
>
();
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
batched_input_data
=
batched_input
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
batched_out_data
=
batched_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
x_dims
=
x
->
dims
();
auto
x_dims
=
x
->
dims
();
auto
wx_dims
=
wx
->
dims
();
auto
wx_dims
=
wx
->
dims
();
const
int
D3
=
wx_dims
[
1
];
const
int
D
=
D3
/
3
;
const
int
D2
=
D
*
2
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
...
@@ -199,125 +207,110 @@ class FusionGRUKernel : public framework::OpKernel<T> {
...
@@ -199,125 +207,110 @@ class FusionGRUKernel : public framework::OpKernel<T> {
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
wx_dims
[
1
],
x_dims
[
1
],
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
wx_dims
[
1
],
x_dims
[
1
],
x_data
,
wx_data
,
xx_data
,
x_data
,
wx_data
,
xx_data
,
bias
?
bias
->
data
<
T
>
()
:
NULL
);
bias
?
bias
->
data
<
T
>
()
:
NULL
);
to_batch
(
dev_ctx
,
*
xx
,
batched_
gate
,
true
,
is_reverse
);
to_batch
(
dev_ctx
,
*
xx
,
batched_
input
,
true
,
is_reverse
);
}
else
{
}
else
{
to_batch
(
dev_ctx
,
*
x
,
xx
,
true
,
is_reverse
);
to_batch
(
dev_ctx
,
*
x
,
xx
,
true
,
is_reverse
);
batched_
gate
->
set_lod
(
xx
->
lod
());
batched_
input
->
set_lod
(
xx
->
lod
());
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
wx_dims
[
1
],
x_dims
[
1
],
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
wx_dims
[
1
],
x_dims
[
1
],
xx_data
,
wx_data
,
batched_
gate
_data
,
xx_data
,
wx_data
,
batched_
input
_data
,
bias
?
bias
->
data
<
T
>
()
:
NULL
);
bias
?
bias
->
data
<
T
>
()
:
NULL
);
}
}
int
frame_size
=
static_cast
<
int
>
(
wx_dims
[
1
]
/
3
);
auto
batched_lod
=
batched_input
->
lod
();
math
::
GRUMetaValue
<
T
>
gru_value
;
const
auto
&
seq_order
=
batched_lod
[
2
];
gru_value
.
gate_weight
=
const_cast
<
T
*>
(
wh_data
);
const
int
max_bs
=
seq_order
.
size
();
gru_value
.
state_weight
=
reordered_h0
->
Resize
({
max_bs
,
D
});
const_cast
<
T
*>
(
wh_data
+
2
*
frame_size
*
frame_size
);
Tensor
ordered_h0
;
framework
::
Vector
<
size_t
>
order
(
batched_gate
->
lod
()[
2
]);
int
tstart
=
0
;
T
*
prev_hidden_data
=
NULL
;
if
(
h0
)
{
if
(
h0
)
{
ReorderInitState
<
DeviceContext
,
T
>
(
// reorder h0
ctx
.
template
device_context
<
DeviceContext
>(),
*
h0
,
order
,
&
ordered_h0
,
T
*
reordered_h0_data
=
reordered_h0
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
true
);
const
T
*
h0_data
=
h0
->
data
<
T
>
();
gru_value
.
prev_out_value
=
ordered_h0
.
data
<
T
>
();
prev_hidden_data
=
reordered_h0_data
;
size_t
sz
=
sizeof
(
T
)
*
D
;
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
std
::
memcpy
(
reordered_h0_data
,
h0_data
+
seq_order
[
i
]
*
D
,
sz
);
reordered_h0_data
+=
D
;
}
}
else
{
}
else
{
gru_value
.
prev_out_value
=
nullptr
;
// compute without h0
T
*
cur_in_data
=
batched_input_data
;
T
*
cur_out_data
=
batched_out_data
;
// W: {W_update, W_reset; W_state}
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
// update gate
act_gate
(
D
,
cur_in_data
,
cur_in_data
);
// state gate
act_state
(
D
,
cur_in_data
+
D2
,
cur_in_data
+
D2
);
// out = a*b
blas
.
VMUL
(
D
,
cur_in_data
,
cur_in_data
+
D2
,
cur_out_data
);
// add offset
cur_in_data
+=
D3
;
cur_out_data
+=
D
;
}
tstart
=
1
;
prev_hidden_data
=
batched_out_data
;
}
}
auto
batch_starts
=
batched_gate
->
lod
()[
0
];
// Then start from next
size_t
seq_len
=
batch_starts
.
size
()
-
1
;
const
T
*
wh_state_data
=
wh_data
+
D
*
D2
;
auto
active_node
=
const
auto
&
batch_starts
=
batched_lod
[
0
];
math
::
detail
::
GetActivationType
(
ctx
.
Attr
<
std
::
string
>
(
"activation"
));
const
int
max_seq_len
=
batch_starts
.
size
()
-
1
;
auto
active_gate
=
math
::
detail
::
GetActivationType
(
batched_input_data
=
batched_input_data
+
tstart
*
max_bs
*
D3
;
ctx
.
Attr
<
std
::
string
>
(
"gate_activation"
));
batched_out_data
=
batched_out_data
+
tstart
*
max_bs
*
D
;
for
(
int
step
=
tstart
;
step
<
max_seq_len
;
++
step
)
{
#ifdef PADDLE_WITH_MKLML
const
int
cur_bs
=
batch_starts
[
step
+
1
]
-
batch_starts
[
step
];
// use MKL packed to speedup GEMM
// gemm prev * (Wu + Wr)
if
(
FLAGS_paddle_num_threads
>=
4
)
{
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
cur_bs
,
D2
,
D
,
static_cast
<
T
>
(
1
),
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
prev_hidden_data
,
D
,
wh_data
,
D2
,
static_cast
<
T
>
(
1
),
T
*
packed_gate
=
blas
.
GEMM_ALLOC
(
CblasBMatrix
,
1
/*height of C*/
,
batched_input_data
,
D3
);
frame_size
*
2
/*width of weight*/
,
frame_size
/*height of weight*/
);
T
*
cur_batched_data
=
batched_input_data
;
PADDLE_ENFORCE
(
packed_gate
);
T
*
cur_prev_hidden_data
=
prev_hidden_data
;
blas
.
GEMM_PACK
(
CblasBMatrix
,
CblasNoTrans
,
1
/*cur bs?*/
,
frame_size
*
2
,
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
frame_size
,
T
(
1.0
),
gru_value
.
gate_weight
,
frame_size
*
2
,
act_gate
(
D2
,
cur_batched_data
,
cur_batched_data
);
packed_gate
);
// rt = rt*ht_1 inplace result
T
*
packed_state
=
blas
.
GEMM_ALLOC
(
CblasBMatrix
,
1
/*height of C*/
,
// TODO(TJ): try to save to cur out data
frame_size
/*width of weight*/
,
// maybe get benefits from avoiding cache misses in the next gemm
frame_size
/*height of weight*/
);
blas
.
VMUL
(
D
,
cur_prev_hidden_data
,
cur_batched_data
+
D
,
PADDLE_ENFORCE
(
packed_state
);
cur_batched_data
+
D
);
blas
.
GEMM_PACK
(
CblasBMatrix
,
CblasNoTrans
,
1
/*cur bs?*/
,
frame_size
,
frame_size
,
T
(
1.0
),
gru_value
.
state_weight
,
frame_size
,
cur_batched_data
+=
D3
;
packed_state
);
cur_prev_hidden_data
+=
D
;
for
(
size_t
n
=
0
;
n
<
seq_len
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batched_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
if
(
gru_value
.
prev_out_value
)
{
blas
.
GEMM_COMPUTE
(
CblasNoTrans
,
CblasPacked
,
cur_batch_size
,
frame_size
*
2
,
frame_size
,
gru_value
.
prev_out_value
,
frame_size
,
packed_gate
,
frame_size
*
2
,
T
(
1
),
gru_value
.
gate_value
,
frame_size
*
3
);
}
math
::
detail
::
forward_reset_output
(
math
::
detail
::
forward
::
gru_resetOutput
<
T
>
(),
gru_value
,
frame_size
,
cur_batch_size
,
active_gate
);
if
(
gru_value
.
prev_out_value
)
{
blas
.
GEMM_COMPUTE
(
CblasNoTrans
,
CblasPacked
,
cur_batch_size
,
frame_size
,
frame_size
,
gru_value
.
reset_output_value
,
frame_size
,
packed_state
,
frame_size
,
T
(
1
),
gru_value
.
gate_value
+
frame_size
*
2
,
frame_size
*
3
);
}
math
::
detail
::
forward_final_output
(
math
::
detail
::
forward
::
gru_finalOutput
<
T
>
(),
gru_value
,
frame_size
,
cur_batch_size
,
active_node
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
}
blas
.
GEMM_FREE
(
packed_gate
);
cur_batched_data
=
batched_input_data
;
blas
.
GEMM_FREE
(
packed_state
);
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
cur_bs
,
D
,
D
,
static_cast
<
T
>
(
1
),
}
else
{
cur_batched_data
+
D
,
D3
,
wh_state_data
,
D
,
static_cast
<
T
>
(
1
),
#endif
cur_batched_data
+
D2
,
D3
);
for
(
size_t
n
=
0
;
n
<
seq_len
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
T
*
cur_out_data
=
batched_out_data
;
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
cur_prev_hidden_data
=
prev_hidden_data
;
int
cur_batch_size
=
bend
-
bstart
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
// ht~ = act_state(...)
Tensor
gate_t
=
batched_gate
->
Slice
(
bstart
,
bend
);
act_state
(
D
,
cur_batched_data
+
D2
,
cur_batched_data
+
D2
);
Tensor
reset_hidden_prev_t
=
// ht~~ = zt*ht~ inplace result
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
blas
.
VMUL
(
D
,
cur_batched_data
,
cur_batched_data
+
D2
,
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
cur_batched_data
+
D2
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
// zt = 1 - zt inplace result
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
bias_sub
(
D
,
static_cast
<
T
>
(
1
),
cur_batched_data
,
cur_batched_data
);
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
// zt = ht_1 * zt
blas
.
VMUL
(
D
,
cur_prev_hidden_data
,
cur_batched_data
,
cur_batched_data
);
math
::
GRUUnitFunctor
<
DeviceContext
,
T
>::
compute
(
// out = zt + ht~~
dev_ctx
,
gru_value
,
frame_size
,
cur_batch_size
,
active_node
,
blas
.
VADD
(
D
,
cur_batched_data
,
cur_batched_data
+
D2
,
cur_out_data
);
active_gate
);
cur_batched_data
+=
D3
;
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
cur_prev_hidden_data
+=
D
;
cur_out_data
+=
D
;
}
}
#ifdef PADDLE_WITH_MKLML
prev_hidden_data
=
batched_out_data
;
batched_out_data
=
cur_out_data
;
batched_input_data
=
cur_batched_data
;
}
}
#endif
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batch
_hidden
->
set_lod
(
batched_gate
->
lod
()
);
batch
ed_out
->
set_lod
(
batched_lod
);
to_seq
(
dev_ctx
,
*
batch
_hidden
,
hidden_out
);
to_seq
(
dev_ctx
,
*
batch
ed_out
,
hidden_out
);
}
}
};
};
...
@@ -327,6 +320,5 @@ class FusionGRUKernel : public framework::OpKernel<T> {
...
@@ -327,6 +320,5 @@ class FusionGRUKernel : public framework::OpKernel<T> {
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fusion_gru
,
ops
::
FusionGRUOp
,
ops
::
FusionGRUOpMaker
,
REGISTER_OPERATOR
(
fusion_gru
,
ops
::
FusionGRUOp
,
ops
::
FusionGRUOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OP_CPU_KERNEL
(
REGISTER_OP_CPU_KERNEL
(
fusion_gru
,
ops
::
FusionGRUKernel
<
float
>
,
fusion_gru
,
ops
::
FusionGRUKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
FusionGRUKernel
<
double
>
);
ops
::
FusionGRUKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/math/sequence2batch.h
浏览文件 @
2d0ddf8c
...
@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor {
...
@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor {
// Calculate the start position of each batch.
// Calculate the start position of each batch.
// example: sequences = {s0, s1, s2}
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
//
num_batch
= 5,
//
max_seqlen
= 5,
// batchIndex = {b0, b1, b2, b3, b4}
// batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
...
@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor {
...
@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor {
// where 1 is the second sequence,
// where 1 is the second sequence,
// 0 is the first sequence,
// 0 is the first sequence,
// 2 is the third sequence.
// 2 is the third sequence.
// The
num_batch
represents batch size after rearranging the
// The
max_seqlen
represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence.
// input LodTensor. It is also the maximum length of input sequence.
paddle
::
framework
::
LoD
batch_lods
;
paddle
::
framework
::
LoD
batch_lods
;
...
@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor {
...
@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor {
batch_lods
.
emplace_back
(
std
::
vector
<
size_t
>
{
0
});
batch_lods
.
emplace_back
(
std
::
vector
<
size_t
>
{
0
});
// batch_lods[0] is the start positions for batch LoDTensor
// batch_lods[0] is the start positions for batch LoDTensor
int
num_batch
=
seq_info
[
0
].
length
;
int
max_seqlen
=
seq_info
[
0
].
length
;
batch_lods
[
0
].
resize
(
static_cast
<
size_t
>
(
num_batch
+
1
));
batch_lods
[
0
].
resize
(
static_cast
<
size_t
>
(
max_seqlen
+
1
));
// batch_lods[1] is the raw index in the input LoDTensor
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods
[
1
].
resize
(
static_cast
<
size_t
>
(
lod_tensor
.
dims
()[
0
]));
batch_lods
[
1
].
resize
(
static_cast
<
size_t
>
(
lod_tensor
.
dims
()[
0
]));
// batch_lods[2] is the sort order for the input LoDTensor.
// batch_lods[2] is the sort order for the input LoDTensor.
...
@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor {
...
@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor {
size_t
*
batch_starts
=
batch_lods
[
0
].
data
();
size_t
*
batch_starts
=
batch_lods
[
0
].
data
();
size_t
*
seq2batch_idx
=
batch_lods
[
1
].
data
();
size_t
*
seq2batch_idx
=
batch_lods
[
1
].
data
();
batch_starts
[
0
]
=
0
;
batch_starts
[
0
]
=
0
;
for
(
int
n
=
0
;
n
<
num_batch
;
n
++
)
{
for
(
int
n
=
0
;
n
<
max_seqlen
;
n
++
)
{
auto
batch_id
=
static_cast
<
int
>
(
batch_starts
[
n
]);
auto
batch_id
=
static_cast
<
int
>
(
batch_starts
[
n
]);
for
(
size_t
i
=
0
;
i
<
seq_info
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
seq_info
.
size
();
++
i
)
{
int
seq_len
=
seq_info
[
i
].
length
;
int
seq_len
=
seq_info
[
i
].
length
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录