Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
085260f3
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
085260f3
编写于
11月 27, 2020
作者:
J
Jack Zhou
提交者:
GitHub
11月 27, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add eigen gru and fix the dropout bug in the rnn
Add eigen gru and fix the dropout bug in the rnn
上级
545df287
变更
6
显示空白变更内容
内联
并排
Showing
6 changed file
with
346 addition
and
267 deletion
+346
-267
paddle/fluid/operators/math/detail/gru_cpu_kernel.h
paddle/fluid/operators/math/detail/gru_cpu_kernel.h
+140
-38
paddle/fluid/operators/math/gru_compute.cc
paddle/fluid/operators/math/gru_compute.cc
+49
-5
paddle/fluid/operators/math/gru_compute.h
paddle/fluid/operators/math/gru_compute.h
+1
-1
paddle/fluid/operators/rnn_op.h
paddle/fluid/operators/rnn_op.h
+148
-217
python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py
python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py
+4
-4
python/paddle/fluid/tests/unittests/test_rnn_op.py
python/paddle/fluid/tests/unittests/test_rnn_op.py
+4
-2
未找到文件。
paddle/fluid/operators/math/detail/gru_cpu_kernel.h
浏览文件 @
085260f3
...
...
@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/gru_compute.h"
...
...
@@ -21,6 +23,10 @@ namespace paddle {
namespace
operators
{
namespace
math
{
namespace
detail
{
using
Array1
=
Eigen
::
DSizes
<
int64_t
,
1
>
;
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenVector
=
framework
::
EigenVector
<
T
,
MajorType
,
IndexType
>
;
#ifndef __NVCC__
...
...
@@ -242,13 +248,35 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
#endif
}
template
<
typename
T
>
inline
void
forward_reset_outputV2
(
const
platform
::
CPUDeviceContext
&
context
,
GRUMetaValue
<
T
>
value
,
int
frame_size
)
{
auto
&
place
=
*
context
.
eigen_device
();
auto
value_reset_gate
=
typename
EigenVector
<
T
>::
Type
(
value
.
gate_value
,
Array1
(
frame_size
));
auto
value_update_gate
=
typename
EigenVector
<
T
>::
Type
(
value
.
gate_value
+
frame_size
,
Array1
(
frame_size
));
auto
value_reset_output
=
typename
EigenVector
<
T
>::
Type
(
value
.
reset_output_value
,
Array1
(
frame_size
));
auto
value_reset_bias
=
typename
EigenVector
<
T
>::
ConstType
(
value
.
reset_bias
,
Array1
(
frame_size
));
SigmoidFunctor
<
T
>
()(
place
,
value_reset_gate
,
value_reset_gate
);
SigmoidFunctor
<
T
>
()(
place
,
value_update_gate
,
value_update_gate
);
value_reset_output
.
device
(
place
)
=
(
value_reset_output
+
value_reset_bias
)
*
value_reset_gate
;
}
template
<
class
OpResetOutput
,
typename
T
>
inline
void
forward_reset_output
(
OpResetOutput
op_reset_output
,
GRUMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
ActivationType
active_gat
e
,
bool
old_version
=
true
)
{
inline
void
forward_reset_output
(
OpResetOutput
op_reset_output
,
GRUMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
ActivationType
active_gate
,
bool
old_version
=
tru
e
,
const
platform
::
CPUDeviceContext
*
context
=
nullptr
)
{
for
(
int
b
=
0
;
b
<
batch_size
;
b
++
)
{
if
(
OpResetOutput
::
avx
&&
(
frame_size
>
static_cast
<
int
>
(
8
-
1
))
&&
if
(
!
old_version
)
{
// use eigen
forward_reset_outputV2
(
*
context
,
value
,
frame_size
);
}
else
{
if
(
OpResetOutput
::
avx
&&
(
frame_size
&
static_cast
<
int
>
(
8
-
1
))
&&
(
sizeof
(
T
)
==
4
))
{
hl_avx_gru_forward_reset_output
(
op_reset_output
,
value
.
gate_value
,
value
.
reset_output_value
,
...
...
@@ -260,6 +288,7 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
value
.
prev_out_value
,
frame_size
,
active_gate
,
old_version
,
value
.
reset_bias
);
}
}
value
.
gate_value
+=
frame_size
*
3
;
value
.
reset_output_value
+=
frame_size
;
if
(
value
.
prev_out_value
)
{
...
...
@@ -268,25 +297,51 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
}
}
template
<
typename
T
>
inline
void
forward_final_outputV2
(
const
platform
::
CPUDeviceContext
&
context
,
GRUMetaValue
<
T
>
value
,
int
frame_size
)
{
auto
&
place
=
*
context
.
eigen_device
();
auto
value_update_gate
=
typename
EigenVector
<
T
>::
Type
(
value
.
gate_value
+
frame_size
,
Array1
(
frame_size
));
auto
value_frame_state
=
typename
EigenVector
<
T
>::
Type
(
value
.
gate_value
+
2
*
frame_size
,
Array1
(
frame_size
));
auto
value_output
=
typename
EigenVector
<
T
>::
Type
(
value
.
output_value
,
Array1
(
frame_size
));
TanhFunctor
<
T
>
()(
place
,
value_frame_state
,
value_frame_state
);
value_output
.
device
(
place
)
=
(
static_cast
<
T
>
(
1.0
)
-
value_update_gate
)
*
value_frame_state
;
if
(
value
.
prev_out_value
)
{
auto
value_prev_out
=
typename
EigenVector
<
T
>::
ConstType
(
value
.
prev_out_value
,
Array1
(
frame_size
));
value_output
.
device
(
place
)
=
value_output
+
value_update_gate
*
value_prev_out
;
}
}
template
<
class
OpFinalOutput
,
typename
T
>
inline
void
forward_final_output
(
OpFinalOutput
op_final_output
,
GRUMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
ActivationType
active_node
,
bool
origin_mode
,
bool
old_version
=
true
)
{
inline
void
forward_final_output
(
OpFinalOutput
op_final_output
,
GRUMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
ActivationType
active_node
,
bool
origin_mode
,
bool
old_version
=
true
,
const
platform
::
CPUDeviceContext
*
context
=
nullptr
)
{
for
(
int
b
=
0
;
b
<
batch_size
;
b
++
)
{
if
(
OpFinalOutput
::
avx
&&
(
frame_size
>
static_cast
<
int
>
(
8
-
1
))
&&
if
(
!
old_version
)
{
// eigen
forward_final_outputV2
(
*
context
,
value
,
frame_size
);
}
else
{
if
(
OpFinalOutput
::
avx
&&
(
frame_size
&
static_cast
<
int
>
(
8
-
1
))
&&
(
sizeof
(
T
)
==
4
))
{
hl_avx_gru_forward_final_output
(
op_final_output
,
value
.
gate_value
,
value
.
prev_out_value
,
value
.
output_value
,
frame_size
,
active_node
,
origin_mode
,
old_version
);
}
else
{
hl_naive_gru_forward_final_output
(
op_final_output
,
value
.
gate_value
,
value
.
prev_out_value
,
value
.
output_value
,
frame_size
,
active_node
,
origin_mode
,
old_version
);
}
else
{
hl_naive_gru_forward_final_output
(
op_final_output
,
value
.
gate_value
,
value
.
prev_out_value
,
value
.
output_value
,
frame_size
,
active_node
,
origin_mode
,
old_version
);
}
}
value
.
gate_value
+=
frame_size
*
3
;
value
.
output_value
+=
frame_size
;
if
(
value
.
prev_out_value
)
{
...
...
@@ -664,23 +719,70 @@ inline void backward_reset_grad(OpResetGrad op_reset_grad,
}
}
template
<
typename
T
>
inline
void
gru_backward
(
const
platform
::
CPUDeviceContext
&
context
,
GRUMetaValue
<
T
>
value
,
GRUMetaGrad
<
T
>
grad
,
int
frame_size
)
{
auto
&
place
=
*
context
.
eigen_device
();
auto
value_reset_gate
=
typename
EigenVector
<
T
>::
Type
(
value
.
gate_value
,
Array1
(
frame_size
));
auto
grad_reset_gate
=
typename
EigenVector
<
T
>::
Type
(
grad
.
gate_grad
,
Array1
(
frame_size
));
auto
value_update_gate
=
typename
EigenVector
<
T
>::
Type
(
value
.
gate_value
+
frame_size
,
Array1
(
frame_size
));
auto
grad_update_gate
=
typename
EigenVector
<
T
>::
Type
(
grad
.
gate_grad
+
frame_size
,
Array1
(
frame_size
));
auto
value_frame_state
=
typename
EigenVector
<
T
>::
Type
(
value
.
gate_value
+
frame_size
*
2
,
Array1
(
frame_size
));
auto
grad_frame_state
=
typename
EigenVector
<
T
>::
Type
(
grad
.
gate_grad
+
frame_size
*
2
,
Array1
(
frame_size
));
auto
grad_output
=
typename
EigenVector
<
T
>::
Type
(
grad
.
output_grad
,
Array1
(
frame_size
));
auto
value_reset_output
=
typename
EigenVector
<
T
>::
Type
(
value
.
reset_output_value
,
Array1
(
frame_size
));
auto
grad_reset_output
=
typename
EigenVector
<
T
>::
Type
(
grad
.
reset_output_grad
,
Array1
(
frame_size
));
if
(
value
.
prev_out_value
)
{
auto
value_prev_out
=
typename
EigenVector
<
T
>::
ConstType
(
value
.
prev_out_value
,
Array1
(
frame_size
));
SigmoidGradFunctor
<
T
>
()(
place
,
1
/*useless*/
,
value_update_gate
,
(
value_prev_out
-
value_frame_state
)
*
grad_output
,
grad_update_gate
);
}
else
{
SigmoidGradFunctor
<
T
>
()(
place
,
1
/*useless*/
,
value_update_gate
,
static_cast
<
T
>
(
-
1
)
*
value_frame_state
*
grad_output
,
grad_update_gate
);
}
if
(
grad
.
prev_out_grad
)
{
auto
grad_prev_out
=
typename
EigenVector
<
T
>::
Type
(
grad
.
prev_out_grad
,
Array1
(
frame_size
));
grad_prev_out
.
device
(
place
)
=
grad_prev_out
+
grad_output
*
value_update_gate
;
}
TanhGradFunctor
<
T
>
()(
place
,
1
/*useless*/
,
value_frame_state
,
grad_output
*
(
static_cast
<
T
>
(
1.0
)
-
value_update_gate
),
grad_frame_state
);
SigmoidGradFunctor
<
T
>
()(
place
,
1
/*useless*/
,
value_reset_gate
,
value_reset_output
/
value_reset_gate
*
grad_frame_state
,
grad_reset_gate
);
if
(
value
.
prev_out_value
&&
grad
.
prev_out_grad
)
{
grad_reset_output
.
device
(
place
)
=
value_reset_gate
*
grad_frame_state
;
}
}
template
<
class
OpGruGrad
,
typename
T
>
inline
void
cpu_gru_backward
(
OpGruGrad
op_gru_grad
,
GRUMetaValue
<
T
>
value
,
inline
void
cpu_gru_backward
(
const
platform
::
CPUDeviceContext
&
context
,
OpGruGrad
op_gru_grad
,
GRUMetaValue
<
T
>
value
,
GRUMetaGrad
<
T
>
grad
,
int
frame_size
,
int
batch_size
,
ActivationType
active_node
,
ActivationType
active_gate
)
{
for
(
int
b
=
0
;
b
<
batch_size
;
++
b
)
{
if
(
OpGruGrad
::
avx
&&
!
(
frame_size
&
(
8
-
1
))
&&
(
sizeof
(
T
)
==
4
))
{
hl_avx_gru_backward
(
op_gru_grad
,
value
.
gate_value
,
grad
.
gate_grad
,
value
.
prev_out_value
,
grad
.
prev_out_grad
,
value
.
reset_output_value
,
grad
.
reset_output_grad
,
grad
.
output_grad
,
frame_size
,
active_node
,
active_gate
);
}
else
{
hl_naive_gru_backward
(
op_gru_grad
,
value
.
gate_value
,
grad
.
gate_grad
,
value
.
prev_out_value
,
grad
.
prev_out_grad
,
value
.
reset_output_value
,
grad
.
reset_output_grad
,
grad
.
output_grad
,
frame_size
,
active_node
,
active_gate
);
}
// eigen
gru_backward
(
context
,
value
,
grad
,
frame_size
);
value
.
gate_value
+=
frame_size
*
3
;
value
.
reset_output_value
+=
frame_size
;
...
...
paddle/fluid/operators/math/gru_compute.cc
浏览文件 @
085260f3
...
...
@@ -42,7 +42,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
}
detail
::
forward_reset_output
(
detail
::
forward
::
gru_resetOutput
<
T
>
(),
value
,
frame_size
,
batch_size
,
active_gate
);
frame_size
,
batch_size
,
active_gate
,
true
,
&
context
);
if
(
value
.
prev_out_value
)
{
blas
.
GEMM
(
false
,
false
,
batch_size
,
frame_size
,
frame_size
,
1
,
...
...
@@ -53,7 +54,7 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
detail
::
forward_final_output
(
detail
::
forward
::
gru_finalOutput
<
T
>
(),
value
,
frame_size
,
batch_size
,
active_node
,
origin_mode
);
origin_mode
,
&
context
);
#endif
}
};
...
...
@@ -116,7 +117,8 @@ struct GRUUnitFunctorV2<platform::CPUDeviceContext, T> {
value
.
reset_output_value
);
}
detail
::
forward_reset_output
(
detail
::
forward
::
gru_resetOutput
<
T
>
(),
value
,
frame_size
,
batch_size
,
active_gate
,
false
);
frame_size
,
batch_size
,
active_gate
,
false
,
&
context
);
T
*
cell_state_value
=
value
.
gate_value
+
2
*
frame_size
;
T
*
reset_output_value
=
value
.
reset_output_value
;
...
...
@@ -129,7 +131,7 @@ struct GRUUnitFunctorV2<platform::CPUDeviceContext, T> {
detail
::
forward_final_output
(
detail
::
forward
::
gru_finalOutput
<
T
>
(),
value
,
frame_size
,
batch_size
,
active_node
,
true
,
false
);
false
,
&
context
);
#endif
}
};
...
...
@@ -144,8 +146,50 @@ struct GRUUnitGradFunctorV2<platform::CPUDeviceContext, T> {
#ifndef __NVCC__
// calculate grad_update_gate, grad_frame_state,
// grad_reset_output, grad_reset_gate
detail
::
cpu_gru_backward
(
detail
::
backward
::
gru
<
T
>
(),
value
,
grad
,
detail
::
cpu_gru_backward
(
context
,
detail
::
backward
::
gru
<
T
>
(),
value
,
grad
,
frame_size
,
batch_size
,
active_node
,
active_gate
);
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
context
);
if
(
grad
.
prev_out_grad
&&
value
.
prev_out_value
)
{
// update prev_out_grad
blas
.
GEMM
(
false
,
false
,
batch_size
,
frame_size
,
frame_size
,
1
,
grad
.
gate_grad
,
frame_size
*
3
,
value
.
gate_weight
,
frame_size
,
1
,
grad
.
prev_out_grad
,
frame_size
);
blas
.
GEMM
(
false
,
false
,
batch_size
,
frame_size
,
frame_size
,
1
,
grad
.
gate_grad
+
frame_size
,
frame_size
*
3
,
value
.
gate_weight
+
frame_size
*
frame_size
,
frame_size
,
1
,
grad
.
prev_out_grad
,
frame_size
);
blas
.
GEMM
(
false
,
false
,
batch_size
,
frame_size
,
frame_size
,
1
,
grad
.
reset_output_grad
,
frame_size
,
value
.
state_weight
,
frame_size
,
1
,
grad
.
prev_out_grad
,
frame_size
);
// update weight_hh_grad
if
(
grad
.
gate_weight_grad
)
{
// reset gate
blas
.
GEMM
(
true
,
false
,
frame_size
,
frame_size
,
batch_size
,
1
,
grad
.
gate_grad
,
frame_size
*
3
,
value
.
prev_out_value
,
frame_size
,
1
,
grad
.
gate_weight_grad
,
frame_size
);
// update gate
blas
.
GEMM
(
true
,
false
,
frame_size
,
frame_size
,
batch_size
,
1
,
grad
.
gate_grad
+
frame_size
,
frame_size
*
3
,
value
.
prev_out_value
,
frame_size
,
1
,
grad
.
gate_weight_grad
+
frame_size
*
frame_size
,
frame_size
);
// cell state
blas
.
GEMM
(
true
,
false
,
frame_size
,
frame_size
,
batch_size
,
1
,
grad
.
reset_output_grad
,
frame_size
,
value
.
prev_out_value
,
frame_size
,
1
,
grad
.
state_weight_grad
,
frame_size
);
}
}
// update bias_hh_grad
T
*
gate_grad
=
grad
.
gate_grad
;
T
*
bias_hh_grad
=
grad
.
bias_hh_grad
;
T
*
state_bias_grad
=
grad
.
bias_hh_grad
+
2
*
frame_size
;
T
*
reset_output_grad
=
grad
.
reset_output_grad
;
for
(
int
b
=
0
;
b
<
batch_size
;
++
b
)
{
blas
.
VADD
(
2
*
frame_size
,
bias_hh_grad
,
gate_grad
,
bias_hh_grad
);
blas
.
VADD
(
frame_size
,
state_bias_grad
,
reset_output_grad
,
state_bias_grad
);
gate_grad
+=
3
*
frame_size
;
reset_output_grad
+=
frame_size
;
}
#endif
}
};
...
...
paddle/fluid/operators/math/gru_compute.h
浏览文件 @
085260f3
...
...
@@ -38,7 +38,7 @@ struct GRUMetaGrad {
T
*
reset_output_grad
;
T
*
output_grad
;
T
*
prev_out_grad
;
T
*
state_bias
_grad
;
T
*
bias_hh
_grad
;
};
template
<
typename
DeviceContext
,
typename
T
>
...
...
paddle/fluid/operators/rnn_op.h
浏览文件 @
085260f3
...
...
@@ -210,66 +210,58 @@ struct LSTMCell : Cell<T> {
}
};
template
<
typename
T
>
void
dropout_helper
(
const
framework
::
ExecutionContext
&
context
,
Tensor
*
x
,
Tensor
*
y
,
const
Tensor
*
mask
,
const
float
&
dropout_prob
)
{
auto
&
place
=
*
context
.
template
device_context
<
platform
::
CPUDeviceContext
>()
.
eigen_device
();
auto
dropout_mask
=
EigenVector
<
uint8_t
>::
Flatten
(
*
mask
);
auto
in
=
EigenVector
<
T
>::
Flatten
(
*
x
);
auto
out
=
EigenVector
<
T
>::
Flatten
(
*
y
);
if
(
dropout_prob
==
1.0
f
)
{
out
.
device
(
place
)
=
static_cast
<
T
>
(
0
)
*
in
;
}
else
{
out
.
device
(
place
)
=
in
*
dropout_mask
.
cast
<
T
>
()
/
static_cast
<
T
>
(
1.0
f
-
dropout_prob
);
}
}
template
<
typename
T
>
void
dropout_cpu_function_inplace
(
const
framework
::
ExecutionContext
&
context
,
Tensor
*
x
,
Tensor
*
mask
,
Tensor
*
x
,
Tensor
*
y
,
Tensor
*
mask
,
const
float
&
dropout_prob
,
const
int
&
seed_number
,
const
bool
&
is_test
,
bool
*
is_has_reset
)
{
if
(
is_test
)
{
return
;
}
auto
*
x_data
=
x
->
data
<
T
>
();
size_t
size
=
framework
::
product
(
x
->
dims
());
auto
*
mask_data
=
mask
->
data
<
uint8_t
>
();
if
(
!
(
*
is_has_reset
))
{
// Special case when dropout_prob is 1.0
if
(
dropout_prob
==
1.0
f
)
{
std
::
fill
(
x_data
,
x_data
+
size
,
static_cast
<
T
>
(
0
));
std
::
fill
(
mask_data
,
mask_data
+
size
,
static_cast
<
T
>
(
0
));
*
is_has_reset
=
true
;
return
;
}
std
::
fill
(
mask_data
,
mask_data
+
size
,
static_cast
<
uint8_t
>
(
0
));
}
else
{
auto
engine
=
framework
::
GetCPURandomEngine
(
seed_number
);
std
::
uniform_real_distribution
<
float
>
dist
(
0
,
1
);
for
(
size_t
i
=
0
;
i
<
size
;
++
i
)
{
if
(
dist
(
*
engine
)
<
dropout_prob
)
{
mask_data
[
i
]
=
0
;
x_data
[
i
]
=
static_cast
<
T
>
(
0
);
}
else
{
mask_data
[
i
]
=
1
;
x_data
[
i
]
/=
static_cast
<
T
>
(
1.0
f
-
dropout_prob
);
}
}
*
is_has_reset
=
true
;
}
else
{
if
(
dropout_prob
==
1.0
f
)
{
std
::
fill
(
x_data
,
x_data
+
size
,
static_cast
<
T
>
(
0
));
return
;
}
for
(
size_t
i
=
0
;
i
<
size
;
++
i
)
{
if
(
mask_data
[
i
]
==
0
)
{
x_data
[
i
]
=
static_cast
<
T
>
(
0
);
}
else
{
x_data
[
i
]
/=
static_cast
<
T
>
(
1.0
f
-
dropout_prob
);
}
}
*
is_has_reset
=
true
;
}
dropout_helper
<
T
>
(
context
,
x
,
y
,
mask
,
dropout_prob
);
}
template
<
typename
T
>
void
dropout_cpu_grad_function_inplace
(
const
framework
::
ExecutionContext
&
context
,
Tensor
*
grad_x
,
const
Tensor
*
mask
,
const
float
&
dropout_prob
)
{
auto
&
place
=
*
context
.
template
device_context
<
platform
::
CPUDeviceContext
>()
.
eigen_device
();
auto
M
=
EigenVector
<
uint8_t
>::
Flatten
(
*
mask
);
auto
dX
=
EigenVector
<
T
>::
Flatten
(
*
grad_x
);
if
(
dropout_prob
==
1.0
f
)
{
dX
.
device
(
place
)
=
static_cast
<
T
>
(
0
)
*
dX
;
}
else
{
dX
.
device
(
place
)
=
dX
*
M
.
cast
<
T
>
()
/
static_cast
<
T
>
(
1.0
f
-
dropout_prob
);
}
dropout_helper
<
T
>
(
context
,
grad_x
,
grad_x
,
mask
,
dropout_prob
);
}
template
<
typename
T
,
typename
CellType
>
...
...
@@ -298,14 +290,13 @@ struct Layer {
blas
.
MatMul
(
*
input
,
mat_dim_a
,
weight
,
mat_dim_b
,
static_cast
<
T
>
(
1.0
),
cache_input
,
static_cast
<
T
>
(
0
));
auto
eigen_
in
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
auto
in
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
cache_input
,
cache_input
->
dims
().
size
()
-
1
);
auto
eigen_bias_ih
=
framework
::
EigenMatrix
<
T
>::
From
(
auto
bias_ih_tmp
=
framework
::
EigenMatrix
<
T
>::
From
(
bias_ih
,
framework
::
make_ddim
({
1
,
bias_ih
.
dims
()[
0
]}));
const
int
&
row_num
=
framework
::
product
(
cache_input
->
dims
())
/
cache_input
->
dims
()[
2
];
eigen_in
=
eigen_in
+
eigen_bias_ih
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
row_num
,
1
));
in
=
in
+
bias_ih_tmp
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
row_num
,
1
));
if
(
is_gru
(
context
))
{
// reset_gate update_gate cell_gate = [1, 1, 0]
Tensor
bias_hh_tmp
;
...
...
@@ -317,15 +308,13 @@ struct Layer {
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
zero
;
zero
(
dev_ctx
,
&
bias_hh_tmp_unbind
[
2
],
static_cast
<
T
>
(
0.0
));
auto
eigen_bias_hh_tmp
=
framework
::
EigenMatrix
<
T
>::
From
(
auto
bias_hh_after_mask
=
framework
::
EigenMatrix
<
T
>::
From
(
bias_hh_tmp
,
framework
::
make_ddim
({
1
,
bias_hh
.
dims
()[
0
]}));
eigen_in
=
eigen_in
+
eigen_bias_hh_tmp
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
row_num
,
1
));
in
=
in
+
bias_hh_after_mask
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
row_num
,
1
));
}
else
{
auto
eigen_bias_hh
=
framework
::
EigenMatrix
<
T
>::
From
(
auto
bias_hh_no_mask
=
framework
::
EigenMatrix
<
T
>::
From
(
bias_hh
,
framework
::
make_ddim
({
1
,
bias_hh
.
dims
()[
0
]}));
eigen_in
=
eigen_in
+
eigen_bias_hh
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
row_num
,
1
));
in
=
in
+
bias_hh_no_mask
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
row_num
,
1
));
}
}
...
...
@@ -335,27 +324,26 @@ struct Layer {
// in the output, if mask flag is 0, we will retun the zero data
auto
&
place
=
*
context
.
template
device_context
<
platform
::
CPUDeviceContext
>()
.
eigen_device
();
auto
eigen_outp
ut
=
auto
o
ut
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
output
,
output
->
dims
().
size
()
-
1
);
auto
eigen_
mask
=
framework
::
EigenMatrix
<
T
>::
From
(
auto
mask
=
framework
::
EigenMatrix
<
T
>::
From
(
mask_tensor
,
framework
::
make_ddim
({
mask_tensor
.
dims
()[
1
],
1
}));
auto
eigen_init
_h
=
auto
pre
_h
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
init_h
,
init_h
->
dims
().
size
()
-
1
);
auto
eigen_last
_h
=
auto
curr
_h
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
last_h
,
last_h
->
dims
().
size
()
-
1
);
auto
eigen_mask_broadcast
=
eigen_mask
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
1
,
output
->
dims
()[
2
]));
eigen_last_h
.
device
(
place
)
=
eigen_output
*
eigen_mask_broadcast
+
eigen_init_h
*
(
1
-
eigen_mask_broadcast
);
eigen_output
.
device
(
place
)
=
eigen_output
*
eigen_mask_broadcast
;
auto
mask_broadcast
=
mask
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
1
,
output
->
dims
()[
2
]));
curr_h
.
device
(
place
)
=
out
*
mask_broadcast
+
pre_h
*
(
1
-
mask_broadcast
);
out
.
device
(
place
)
=
out
*
mask_broadcast
;
if
(
is_lstm
(
context
))
{
auto
eigen_init
_c
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
auto
pre
_c
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
init_c
,
init_c
->
dims
().
size
()
-
1
);
auto
eigen_last
_c
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
auto
curr
_c
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
last_c
,
last_c
->
dims
().
size
()
-
1
);
eigen_last_c
.
device
(
place
)
=
eigen_last_c
*
eigen_mask_broadcast
+
eigen_init_c
*
(
1
-
eigen_
mask_broadcast
);
curr_c
.
device
(
place
)
=
curr_c
*
mask_broadcast
+
pre_c
*
(
1
-
mask_broadcast
);
}
}
...
...
@@ -910,16 +898,18 @@ void RnnFunc(const framework::ExecutionContext& ctx, const Tensor* input,
}
if
(
!
is_test
)
{
prev_hidden_data
=
hidden_data
.
Slice
(
i
-
1
,
i
);
input_holder
->
Resize
(
output
->
dims
());
if
(
dropout_prob
!=
0
)
{
dropout_cpu_function_inplace
<
T
>
(
ctx
,
&
prev_hidden_data
,
input_holder
,
dropout_mask
,
dropout_prob
,
seed
,
is_test
,
&
has_dropout_reset
);
}
else
{
input_holder
=
&
prev_hidden_data
;
input_holder
->
Resize
(
output
->
dims
());
}
}
else
{
SwapPoniter
(
&
output_holder
,
&
input_holder
);
}
if
(
dropout_prob
!=
0
&&
(
!
is_test
))
{
dropout_cpu_function_inplace
<
T
>
(
ctx
,
input_holder
,
dropout_mask
,
dropout_prob
,
seed
,
is_test
,
&
has_dropout_reset
);
}
}
const
Tensor
*
input_temp_holder
=
input
;
if
(
i
>
0
)
{
...
...
@@ -1040,53 +1030,6 @@ void create_tensor_by_list(const framework::ExecutionContext& context,
}
}
template
<
typename
T
>
void
make_grad_gate_buf
(
const
framework
::
ExecutionContext
&
context
,
Tensor
*
grad_gate
,
Tensor
*
grad_gate_buf
,
Tensor
*
reset_output_grad
=
nullptr
)
{
int
dim_size
=
grad_gate
->
dims
().
size
();
int
batch_size
=
grad_gate
->
dims
()[
dim_size
-
2
];
int
frame_size
=
grad_gate
->
dims
()[
dim_size
-
1
];
Tensor
grad_gate_mask
;
create_tensor_by_list
<
T
>
(
context
,
&
grad_gate_mask
,
{
1
,
1
,
0
});
auto
&
place
=
*
context
.
template
device_context
<
platform
::
CPUDeviceContext
>()
.
eigen_device
();
auto
eigen_grad_gate_mask
=
framework
::
EigenMatrix
<
T
>::
From
(
grad_gate_mask
,
framework
::
make_ddim
({
3
,
1
}));
auto
eigen_grad_gate_mask_broadcast
=
eigen_grad_gate_mask
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
1
,
frame_size
/
3
))
.
reshape
(
Eigen
::
DSizes
<
int
,
1
>
(
frame_size
))
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
batch_size
,
1
));
auto
eigen_grad_gate_buf
=
framework
::
EigenMatrix
<
T
>::
From
(
*
grad_gate_buf
,
framework
::
make_ddim
({
batch_size
,
frame_size
}));
auto
eigen_grad_gate
=
framework
::
EigenMatrix
<
T
>::
From
(
*
grad_gate
,
framework
::
make_ddim
({
batch_size
,
frame_size
}));
eigen_grad_gate_buf
.
device
(
place
)
=
eigen_grad_gate
*
eigen_grad_gate_mask_broadcast
;
if
(
reset_output_grad
)
{
Tensor
grad_reset_output_mask
;
create_tensor_by_list
<
T
>
(
context
,
&
grad_reset_output_mask
,
{
0
,
0
,
1
});
auto
eigen_grad_reset_output_mask
=
framework
::
EigenMatrix
<
T
>::
From
(
grad_reset_output_mask
,
framework
::
make_ddim
({
3
,
1
}));
auto
eigen_grad_reset_output_mask_broadcast
=
eigen_grad_reset_output_mask
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
1
,
frame_size
/
3
))
.
reshape
(
Eigen
::
DSizes
<
int
,
1
>
(
frame_size
))
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
batch_size
,
1
));
auto
eigen_grad_reset_output
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
reset_output_grad
,
reset_output_grad
->
dims
().
size
()
-
1
)
.
broadcast
(
Eigen
::
DSizes
<
int
,
3
>
(
1
,
3
,
1
))
.
reshape
(
Eigen
::
DSizes
<
int
,
2
>
(
batch_size
,
frame_size
));
eigen_grad_gate_buf
.
device
(
place
)
=
eigen_grad_gate_buf
+
eigen_grad_reset_output_mask_broadcast
*
eigen_grad_reset_output
;
}
}
template
<
typename
T
,
typename
GradCellType
>
struct
GradLayer
{
explicit
GradLayer
(
const
GradCellType
&
cell
)
:
cell_
(
cell
)
{}
...
...
@@ -1196,12 +1139,10 @@ struct GradLayer {
Tensor
*
pre_hidden
=
nullptr
;
Tensor
*
pre_state
=
nullptr
;
Tensor
*
hidden
=
nullptr
;
Tensor
grad_gate_buf
;
TensorList
grad_gate_buf_unbind
;
if
(
is_gru
(
context
))
{
grad_gate_buf
.
Resize
(
layer_grad_gate_tensor
->
dims
());
grad_gate_buf
.
mutable_data
<
T
>
(
context
.
GetPlace
());
grad_gate_buf_unbind
=
Unbind
(
grad_gate_buf
);
zero
(
device_ctx
,
&
((
*
weight_list_grad
)[
layer_idx
][
current_reverse_idx
*
4
+
3
]),
static_cast
<
T
>
(
0.0
)
);
}
for
(
int
i
=
time_step
-
1
;
i
>=
0
;
--
i
)
{
if
(
has_sequence_length
)
{
...
...
@@ -1232,7 +1173,7 @@ struct GradLayer {
&
(
parameter_lists
[
layer_idx
][
current_reverse_idx
*
4
+
1
]),
pre_hidden
,
pre_state
,
dynamic_grad_last_h
,
dynamic_grad_last_c
,
&
(
*
layer_grad_gate_tensor_unbind
)[
i
],
weight_grad
,
dynamic_grad_pre_h
,
dynamic_grad_pre_c
,
&
grad_gate_buf_unbind
[
i
],
dynamic_grad_pre_c
,
&
((
*
weight_list_grad
)[
layer_idx
][
current_reverse_idx
*
4
+
3
]),
mask_tensor_list
[
i
],
has_sequence_length
);
SwapPoniter
(
&
dynamic_grad_last_h
,
&
dynamic_grad_pre_h
);
...
...
@@ -1241,8 +1182,7 @@ struct GradLayer {
// postproces for gradient for w_hi, X, bias_hi, bias_hh
this
->
postprocess
(
context
,
*
layer_grad_gate_tensor
,
*
input
,
input_grad
,
parameter_lists
[
layer_idx
],
&
((
*
weight_list_grad
)[
layer_idx
]),
&
grad_gate_buf
,
is_reverse
);
&
((
*
weight_list_grad
)[
layer_idx
]),
is_reverse
);
// copy the gradient to init_c init_h
if
((
*
init_h_grad_unbind
).
size
()
>
0
&&
time_step
%
2
==
0
)
{
...
...
@@ -1268,16 +1208,17 @@ struct GradLayer {
TensorList
*
init_h_grad_unbind
,
TensorList
*
init_c_grad_unbind
,
const
std
::
vector
<
TensorList
>&
weight_list_grad
,
const
int
&
layer_idx
,
const
int
&
gate_num
)
{}
void
preprocess
(
const
framework
::
ExecutionContext
&
context
,
const
Tensor
*
grad_output
,
Tensor
*
grad_last_h
)
{
auto
&
place
=
*
context
.
template
device_context
<
platform
::
CPUDeviceContext
>()
.
eigen_device
();
auto
eigen_grad_output
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
auto
output_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_output
,
grad_output
->
dims
().
size
()
-
1
);
auto
eigen_grad_last_h
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
auto
last_h_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_last_h
,
grad_last_h
->
dims
().
size
()
-
1
);
// the output gradient contribute the gradient to last_h
eigen_grad_last_h
.
device
(
place
)
=
eigen_grad_last_h
+
eigen_grad_output
;
last_h_grad
.
device
(
place
)
=
last_h_grad
+
output_grad
;
}
void
mask_preprocess
(
const
framework
::
ExecutionContext
&
context
,
...
...
@@ -1286,40 +1227,35 @@ struct GradLayer {
Tensor
*
grad_pre_c
,
const
Tensor
&
mask_tensor
)
{
auto
&
place
=
*
context
.
template
device_context
<
platform
::
CPUDeviceContext
>()
.
eigen_device
();
auto
eigen_
mask
=
framework
::
EigenMatrix
<
T
>::
From
(
auto
mask
=
framework
::
EigenMatrix
<
T
>::
From
(
mask_tensor
,
framework
::
make_ddim
({
mask_tensor
.
dims
()[
1
],
1
}));
auto
eigen_
mask_broadcast
=
eigen_
mask
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
1
,
grad_output
->
dims
()[
2
]));
auto
mask_broadcast
=
mask
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
1
,
grad_output
->
dims
()[
2
]));
auto
eigen_grad_last_h
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
auto
last_h_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_last_h
,
grad_last_h
->
dims
().
size
()
-
1
);
auto
eigen_grad_pre_h
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
auto
pre_h_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_pre_h
,
grad_pre_h
->
dims
().
size
()
-
1
);
auto
eigen_grad_output
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
auto
output_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_output
,
grad_output
->
dims
().
size
()
-
1
);
eigen_grad_last_h
.
device
(
place
)
=
eigen_grad_last_h
+
eigen_grad_output
*
eigen_mask_broadcast
;
eigen_grad_pre_h
.
device
(
place
)
=
(
1
-
eigen_mask_broadcast
)
*
eigen_grad_last_h
;
eigen_grad_last_h
.
device
(
place
)
=
eigen_mask_broadcast
*
eigen_grad_last_h
;
last_h_grad
.
device
(
place
)
=
last_h_grad
+
output_grad
*
mask_broadcast
;
pre_h_grad
.
device
(
place
)
=
(
1
-
mask_broadcast
)
*
last_h_grad
;
last_h_grad
.
device
(
place
)
=
mask_broadcast
*
last_h_grad
;
if
(
grad_last_c
&&
grad_pre_c
&&
is_lstm
(
context
))
{
auto
eigen_grad_last_c
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
auto
last_c_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_last_c
,
grad_last_c
->
dims
().
size
()
-
1
);
auto
eigen_grad_pre_c
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
auto
pre_c_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_pre_c
,
grad_pre_c
->
dims
().
size
()
-
1
);
eigen_grad_pre_c
.
device
(
place
)
=
(
1
-
eigen_mask_broadcast
)
*
eigen_grad_last_c
;
eigen_grad_last_c
.
device
(
place
)
=
eigen_mask_broadcast
*
eigen_grad_last_c
;
pre_c_grad
.
device
(
place
)
=
(
1
-
mask_broadcast
)
*
last_c_grad
;
last_c_grad
.
device
(
place
)
=
mask_broadcast
*
last_c_grad
;
}
}
void
postprocess
(
const
framework
::
ExecutionContext
&
context
,
const
Tensor
&
grad_gate
,
const
Tensor
&
input
,
Tensor
*
input_grad
,
const
TensorList
&
parameters
,
TensorList
*
grad_parameters
,
Tensor
*
grad_gate_buf
,
const
int
&
is_reverse
)
{
TensorList
*
grad_parameters
,
const
int
&
is_reverse
)
{
// we get the grad_gate step by step, and need to bradocast the grad to the
// grad_w_hi, grad_bias_hi, grad_bias_hh
int
begin_idx
=
0
;
...
...
@@ -1360,10 +1296,7 @@ struct GradLayer {
{
grad_gate
.
dims
()[
0
]
*
grad_gate
.
dims
()[
1
],
grad_gate
.
dims
()[
2
]});
col_sum
(
device_ctx
,
tmp_grad_gate
,
&
((
*
grad_parameters
)[
begin_idx
+
2
]));
// Bias_hh
if
(
is_gru
(
context
))
{
grad_gate_buf
->
Resize
(
tmp_grad_gate
.
dims
());
col_sum
(
device_ctx
,
*
grad_gate_buf
,
&
((
*
grad_parameters
)[
begin_idx
+
3
]));
}
else
{
if
(
!
is_gru
(
context
))
{
col_sum
(
device_ctx
,
tmp_grad_gate
,
&
((
*
grad_parameters
)[
begin_idx
+
3
]));
}
}
...
...
@@ -1600,64 +1533,69 @@ struct GradCell {
Tensor
*
pre_state
,
Tensor
*
grad_hidden
,
Tensor
*
grad_state
,
Tensor
*
grad_gate
,
Tensor
*
grad_weight_hh
,
Tensor
*
grad_pre_hidden
,
Tensor
*
grad_pre_state
,
Tensor
*
grad_
gate_buf
,
Tensor
*
grad_bias_hh
,
const
Tensor
&
mask_tensor
,
Tensor
*
grad_pre_state
,
Tensor
*
grad_
bias_hh
,
const
Tensor
&
mask_tensor
,
bool
has_sequence_length
)
const
{}
void
postprocess_pre_hidden_grad
(
const
framework
::
ExecutionContext
&
context
,
Tensor
*
grad_pre_hidden
,
Tensor
*
grad_pre_hidden_bak
,
Tensor
*
grad_pre_state
,
Tensor
*
grad_pre_state_bak
,
const
Tensor
&
mask_tensor
,
bool
has_sequence_length
)
const
{
if
(
has_sequence_length
)
{
auto
&
place
=
*
context
.
template
device_context
<
platform
::
CPUDeviceContext
>()
.
eigen_device
();
auto
mask
=
framework
::
EigenMatrix
<
T
>::
From
(
mask_tensor
,
framework
::
make_ddim
({
mask_tensor
.
dims
()[
1
],
1
}));
auto
mask_broadcast
=
mask
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
1
,
grad_pre_hidden
->
dims
()[
2
]));
auto
pre_hidden_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_pre_hidden
,
grad_pre_hidden
->
dims
().
size
()
-
1
);
auto
pre_hidden_bak_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_pre_hidden_bak
,
grad_pre_hidden_bak
->
dims
().
size
()
-
1
);
pre_hidden_grad
.
device
(
place
)
=
(
1
-
mask_broadcast
)
*
pre_hidden_bak_grad
+
pre_hidden_grad
*
mask_broadcast
;
if
(
grad_pre_state
)
{
auto
pre_state_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_pre_state
,
grad_pre_state
->
dims
().
size
()
-
1
);
auto
pre_state_bak_grad
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_pre_state_bak
,
grad_pre_state_bak
->
dims
().
size
()
-
1
);
pre_state_grad
.
device
(
place
)
=
(
1
-
mask_broadcast
)
*
pre_state_bak_grad
+
pre_state_grad
*
mask_broadcast
;
}
}
}
virtual
void
update_pre_hidden_grad
(
const
framework
::
ExecutionContext
&
context
,
Tensor
*
grad_gate
,
const
Tensor
*
weight_hh
,
Tensor
*
grad_pre_hidden
,
Tensor
*
grad_pre_hidden_bak
,
Tensor
*
grad_pre_state
,
Tensor
*
grad_pre_state_bak
,
Tensor
*
grad_gate_buf
,
const
Tensor
&
mask_tensor
,
bool
has_sequence_length
)
const
{
Tensor
*
grad_pre_state_bak
,
const
Tensor
&
mask_tensor
,
bool
has_sequence_length
)
const
{
auto
&
device_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
device_ctx
);
T
beta
=
0
;
Tensor
*
grad_gate_tmp
=
grad_gate
;
if
(
is_gru
(
context
))
{
beta
=
1.0
;
grad_gate_tmp
=
grad_gate_buf
;
}
auto
mat_dim_a
=
math
::
CreateMatrixDescriptor
(
grad_gate_tmp
->
dims
(),
0
,
false
);
mat_dim_a
.
height_
*=
mat_dim_a
.
batch_size_
;
mat_dim_a
.
batch_size_
=
0
;
auto
mat_dim_b
=
math
::
CreateMatrixDescriptor
(
weight_hh
->
dims
(),
0
,
false
);
blas
.
MatMul
(
*
grad_gate_tmp
,
mat_dim_a
,
*
weight_hh
,
mat_dim_b
,
static_cast
<
T
>
(
1.0
),
grad_pre_hidden
,
beta
);
if
(
has_sequence_length
)
{
auto
&
place
=
*
context
.
template
device_context
<
platform
::
CPUDeviceContext
>()
.
eigen_device
();
auto
eigen_mask
=
framework
::
EigenMatrix
<
T
>::
From
(
mask_tensor
,
framework
::
make_ddim
({
mask_tensor
.
dims
()[
1
],
1
}));
auto
eigen_mask_broadcast
=
eigen_mask
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
1
,
grad_pre_hidden
->
dims
()[
2
]));
auto
eigen_grad_pre_hidden
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_pre_hidden
,
grad_pre_hidden
->
dims
().
size
()
-
1
);
auto
eigen_grad_pre_hidden_bak
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_pre_hidden_bak
,
grad_pre_hidden_bak
->
dims
().
size
()
-
1
);
eigen_grad_pre_hidden
.
device
(
place
)
=
(
1
-
eigen_mask_broadcast
)
*
eigen_grad_pre_hidden_bak
+
eigen_grad_pre_hidden
*
eigen_mask_broadcast
;
if
(
grad_pre_state
)
{
auto
eigen_grad_pre_state
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_pre_state
,
grad_pre_state
->
dims
().
size
()
-
1
);
auto
eigen_grad_pre_state_bak
=
framework
::
EigenMatrix
<
T
>::
Reshape
(
*
grad_pre_state_bak
,
grad_pre_state_bak
->
dims
().
size
()
-
1
);
eigen_grad_pre_state
.
device
(
place
)
=
(
1
-
eigen_mask_broadcast
)
*
eigen_grad_pre_state_bak
+
eigen_grad_pre_state
*
eigen_mask_broadcast
;
}
}
static_cast
<
T
>
(
1.0
),
grad_pre_hidden
,
0
);
postprocess_pre_hidden_grad
(
context
,
grad_pre_hidden
,
grad_pre_hidden_bak
,
grad_pre_state
,
grad_pre_state_bak
,
mask_tensor
,
has_sequence_length
);
}
virtual
void
update_weight_hh_grad
(
const
framework
::
ExecutionContext
&
context
,
Tensor
*
grad_gate
,
Tensor
*
pre_hidden
,
Tensor
*
grad_weight_hh
,
Tensor
*
grad_gate_buf
)
const
{
Tensor
*
grad_weight_hh
)
const
{
auto
&
device_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
device_ctx
);
...
...
@@ -1667,11 +1605,7 @@ struct GradCell {
auto
mat_dim_d
=
math
::
CreateMatrixDescriptor
(
pre_hidden
->
dims
(),
0
,
false
);
mat_dim_d
.
height_
*=
mat_dim_d
.
batch_size_
;
mat_dim_d
.
batch_size_
=
0
;
Tensor
*
grad_gate_tmp
=
grad_gate
;
if
(
is_gru
(
context
))
{
grad_gate_tmp
=
grad_gate_buf
;
}
blas
.
MatMul
(
*
grad_gate_tmp
,
mat_dim_c
,
*
pre_hidden
,
mat_dim_d
,
blas
.
MatMul
(
*
grad_gate
,
mat_dim_c
,
*
pre_hidden
,
mat_dim_d
,
static_cast
<
T
>
(
1.0
),
grad_weight_hh
,
static_cast
<
T
>
(
1.0
));
}
};
...
...
@@ -1685,8 +1619,7 @@ struct SimpleRNNGradCell : GradCell<T> {
Tensor
*
pre_state
,
Tensor
*
grad_hidden
,
Tensor
*
grad_state
,
Tensor
*
grad_gate
,
Tensor
*
grad_weight_hh
,
Tensor
*
grad_pre_hidden
,
Tensor
*
grad_pre_state
,
Tensor
*
grad_gate_buf
,
Tensor
*
grad_bias_hh
,
const
Tensor
&
mask_tensor
,
Tensor
*
grad_bias_hh
,
const
Tensor
&
mask_tensor
,
bool
has_sequence_length
)
const
override
{
auto
&
device_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
...
...
@@ -1711,11 +1644,10 @@ struct SimpleRNNGradCell : GradCell<T> {
functor
(
*
place
,
z
,
h
,
dh
,
dz
);
// update grad_weight_hh, grad_pre_hidden
this
->
update_pre_hidden_grad
(
context
,
grad_gate
,
weight_hh
,
grad_pre_hidden
,
&
grad_pre_hidden_bak
,
nullptr
,
nullptr
,
grad_gate_buf
,
mask_tensor
,
has_sequence_length
);
this
->
update_weight_hh_grad
(
context
,
grad_gate
,
pre_hidden
,
grad_weight_hh
,
grad_gate_buf
);
this
->
update_pre_hidden_grad
(
context
,
grad_gate
,
weight_hh
,
grad_pre_hidden
,
&
grad_pre_hidden_bak
,
nullptr
,
nullptr
,
mask_tensor
,
has_sequence_length
);
this
->
update_weight_hh_grad
(
context
,
grad_gate
,
pre_hidden
,
grad_weight_hh
);
}
};
...
...
@@ -1728,8 +1660,7 @@ struct GRUGradCell : GradCell<T> {
Tensor
*
pre_state
,
Tensor
*
grad_hidden
,
Tensor
*
grad_state
,
Tensor
*
grad_gate
,
Tensor
*
grad_weight_hh
,
Tensor
*
grad_pre_hidden
,
Tensor
*
grad_pre_state
,
Tensor
*
grad_gate_buf
,
Tensor
*
grad_bias_hh
,
const
Tensor
&
mask_tensor
,
Tensor
*
grad_bias_hh
,
const
Tensor
&
mask_tensor
,
bool
has_sequence_length
)
const
override
{
auto
&
device_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
...
...
@@ -1747,6 +1678,8 @@ struct GRUGradCell : GradCell<T> {
gru_value
.
gate_value
=
gate_tensor
->
data
<
T
>
();
gru_value
.
prev_out_value
=
pre_hidden
->
data
<
T
>
();
gru_value
.
reset_output_value
=
state_tensor
->
data
<
T
>
();
gru_value
.
state_weight
=
weight_hh
->
data
<
T
>
()
+
2
*
frame_size
*
frame_size
;
gru_value
.
gate_weight
=
weight_hh
->
data
<
T
>
();
gru_grad
.
gate_grad
=
grad_gate
->
data
<
T
>
();
gru_grad
.
reset_output_grad
=
grad_state
->
data
<
T
>
();
...
...
@@ -1755,7 +1688,7 @@ struct GRUGradCell : GradCell<T> {
gru_grad
.
gate_weight_grad
=
grad_weight_hh
->
data
<
T
>
();
gru_grad
.
state_weight_grad
=
grad_weight_hh
->
data
<
T
>
()
+
2
*
frame_size
*
frame_size
;
gru_grad
.
state_bias_grad
=
grad_bias_hh
->
data
<
T
>
()
+
2
*
frame_size
;
gru_grad
.
bias_hh_grad
=
grad_bias_hh
->
data
<
T
>
()
;
auto
act_gate
=
math
::
detail
::
GetActivationType
(
"sigmoid_v2"
);
auto
act_node
=
math
::
detail
::
GetActivationType
(
"tanh_v2"
);
...
...
@@ -1763,13 +1696,9 @@ struct GRUGradCell : GradCell<T> {
device_ctx
,
gru_value
,
gru_grad
,
frame_size
,
batch_size
,
act_node
,
act_gate
);
make_grad_gate_buf
<
T
>
(
context
,
grad_gate
,
grad_gate_buf
,
grad_state
);
this
->
update_pre_hidden_grad
(
context
,
grad_gate
,
weight_hh
,
grad_pre_hidden
,
&
grad_pre_hidden_bak
,
nullptr
,
nullptr
,
grad_gate_buf
,
mask_tensor
,
has_sequence_length
);
this
->
update_weight_hh_grad
(
context
,
grad_gate
,
pre_hidden
,
grad_weight_hh
,
grad_gate_buf
);
this
->
postprocess_pre_hidden_grad
(
context
,
grad_pre_hidden
,
&
grad_pre_hidden_bak
,
nullptr
,
nullptr
,
mask_tensor
,
has_sequence_length
);
}
};
...
...
@@ -1782,8 +1711,7 @@ struct LSTMGradCell : GradCell<T> {
Tensor
*
pre_state
,
Tensor
*
grad_hidden
,
Tensor
*
grad_state
,
Tensor
*
grad_gate
,
Tensor
*
grad_weight_hh
,
Tensor
*
grad_pre_hidden
,
Tensor
*
grad_pre_state
,
Tensor
*
grad_gate_buf
,
Tensor
*
grad_bias_hh
,
const
Tensor
&
mask_tensor
,
Tensor
*
grad_bias_hh
,
const
Tensor
&
mask_tensor
,
bool
has_sequence_length
)
const
override
{
auto
&
device_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
...
...
@@ -1822,12 +1750,10 @@ struct LSTMGradCell : GradCell<T> {
math
::
LstmUnitGradFunctor
<
platform
::
CPUDeviceContext
,
T
>::
compute
(
device_ctx
,
lstm_value
,
lstm_grad
,
frame_size
,
batch_size
,
cell_clip
,
gate_act
,
state_act
,
cand_act
,
false
);
this
->
update_pre_hidden_grad
(
context
,
grad_gate
,
weight_hh
,
grad_pre_hidden
,
&
grad_pre_hidden_bak
,
grad_pre_state
,
&
grad_pre_state_bak
,
grad_gate_buf
,
mask_tensor
,
has_sequence_length
);
this
->
update_weight_hh_grad
(
context
,
grad_gate
,
pre_hidden
,
grad_weight_hh
,
grad_gate_buf
);
this
->
update_pre_hidden_grad
(
context
,
grad_gate
,
weight_hh
,
grad_pre_hidden
,
&
grad_pre_hidden_bak
,
grad_pre_state
,
&
grad_pre_state_bak
,
mask_tensor
,
has_sequence_length
);
this
->
update_weight_hh_grad
(
context
,
grad_gate
,
pre_hidden
,
grad_weight_hh
);
}
};
...
...
@@ -2001,7 +1927,12 @@ void RnnGradFunc(const framework::ExecutionContext& context,
for
(
int
i
=
num_layers
-
1
;
i
>=
0
;
--
i
)
{
// the layer input output had saved, just use the data
if
(
i
>
0
)
{
layer_input
.
ShareDataWith
(
hidden_tensor_unbind
[
i
-
1
]);
if
(
layer_input
.
numel
()
==
0
)
{
layer_input
.
Resize
(
hidden_tensor_unbind
[
i
-
1
].
dims
());
layer_input
.
mutable_data
<
T
>
(
context
.
GetPlace
());
}
dropout_helper
<
T
>
(
context
,
&
hidden_tensor_unbind
[
i
-
1
],
&
layer_input
,
dropout_state
,
dropout_prob
);
}
else
{
layer_input
.
ShareDataWith
(
*
input
);
}
...
...
python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py
浏览文件 @
085260f3
...
...
@@ -294,7 +294,6 @@ def unstack(array, axis=0):
def
dropout
(
array
,
p
=
0.5
):
if
p
==
0.0
:
return
array
mask
=
(
np
.
random
.
uniform
(
size
=
array
.
shape
)
<
(
1
-
p
)).
astype
(
array
.
dtype
)
return
array
*
(
mask
/
(
1
-
p
))
...
...
@@ -390,11 +389,12 @@ class RNNMixin(LayerListMixin):
states
=
split_states
(
initial_states
,
self
.
num_directions
==
2
,
self
.
state_components
)
final_states
=
[]
input_temp
=
inputs
for
i
,
rnn_layer
in
enumerate
(
self
):
if
i
>
0
:
inputs
=
dropout
(
inputs
,
self
.
dropout
)
outputs
,
final_state
=
rnn_layer
(
inputs
,
states
[
i
],
sequence_length
)
input_temp
=
dropout
(
inputs
,
self
.
dropout
)
outputs
,
final_state
=
rnn_layer
(
input_temp
,
states
[
i
],
sequence_length
)
final_states
.
append
(
final_state
)
inputs
=
outputs
...
...
python/paddle/fluid/tests/unittests/test_rnn_op.py
浏览文件 @
085260f3
...
...
@@ -53,6 +53,7 @@ class TestRNNOp(OpTest):
self
.
is_bidirec
=
False
self
.
mode
=
"LSTM"
self
.
is_test
=
False
self
.
dropout
=
0.0
self
.
set_attrs
()
self
.
direction_num
=
2
if
self
.
is_bidirec
else
1
...
...
@@ -76,7 +77,8 @@ class TestRNNOp(OpTest):
hidden_size
,
num_layers
=
self
.
num_layers
,
time_major
=
True
,
direction
=
direction
)
direction
=
direction
,
dropout
=
self
.
dropout
)
flat_w
=
get_params_for_net
(
rnn1
)
output
,
(
last_hidden
,
last_cell
)
=
rnn1
(
...
...
@@ -101,7 +103,7 @@ class TestRNNOp(OpTest):
'PreState'
:
[(
'init_h'
,
init_h
),
(
'init_c'
,
init_c
)],
}
self
.
attrs
=
{
'dropout_prob'
:
0.0
,
'dropout_prob'
:
self
.
dropout
,
'is_bidirec'
:
self
.
is_bidirec
,
'input_size'
:
input_size
,
'hidden_size'
:
hidden_size
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录