Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
cf5ea925
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
cf5ea925
编写于
8月 22, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix bugs
上级
6ed20474
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
75 addition
and
68 deletion
+75
-68
paddle/fluid/operators/attention_lstm_op.cc
paddle/fluid/operators/attention_lstm_op.cc
+58
-65
paddle/fluid/operators/math/blas.h
paddle/fluid/operators/math/blas.h
+16
-1
paddle/fluid/operators/math/blas_impl.h
paddle/fluid/operators/math/blas_impl.h
+1
-2
未找到文件。
paddle/fluid/operators/attention_lstm_op.cc
浏览文件 @
cf5ea925
...
...
@@ -15,12 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/attention_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
// #include "paddle/fluid/operators/math/detail/activation_functions.h"
// #include "paddle/fluid/operators/math/cpu_vec.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -233,6 +230,13 @@ use lstm_x_t as input and compute as standard LSTM.
)DOC"
);
}
template
<
typename
T
>
inline
void
vec_relu
(
const
int
n
,
const
T
*
x
,
T
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
>
0
?
x
[
i
]
:
0
;
}
}
// y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0;
template
<
typename
T
>
inline
void
bias_relu
(
const
int
n
,
const
T
*
x
,
const
T
*
bias
,
T
*
y
)
{
...
...
@@ -240,14 +244,14 @@ inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
+
bias
[
0
];
}
vec_relu
(
n
,
y
,
y
);
vec_relu
<
T
>
(
n
,
y
,
y
);
}
else
{
vec_relu
(
n
,
x
,
y
);
vec_relu
<
T
>
(
n
,
x
,
y
);
}
}
template
<
typename
DeviceContext
,
typename
T
>
inline
void
vec_softmax
(
const
BlasT
<
DeviceContext
,
T
>&
blas
,
const
int
n
,
inline
void
vec_softmax
(
const
math
::
BlasT
<
DeviceContext
,
T
>&
blas
,
const
int
n
,
const
T
*
x
,
T
*
y
)
{
T
scalar
=
x
[
0
];
// max
...
...
@@ -257,7 +261,7 @@ inline void vec_softmax(const BlasT<DeviceContext, T>& blas, const int n,
// sub
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
c
]
=
x
[
c
]
-
alpha
;
y
[
i
]
=
x
[
i
]
-
scalar
;
}
// exp
...
...
@@ -270,57 +274,45 @@ inline void vec_softmax(const BlasT<DeviceContext, T>& blas, const int n,
}
// scale
blas
.
V
SCAL
(
n
,
static_cast
<
T
>
(
1
)
/
scalar
,
y
);
blas
.
SCAL
(
n
,
static_cast
<
T
>
(
1
)
/
scalar
,
y
);
}
__m256
exp
(
__m256
a
)
{
return
exp256_ps
(
a
);
}
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
__m256
log
(
__m256
a
)
{
return
log256_ps
(
a
);
}
__m256
sin
(
__m256
a
)
{
return
sin256_ps
(
a
);
}
__m256
cos
(
__m256
a
)
{
return
cos256_ps
(
a
);
}
__m256
relu
(
const
__m256
a
)
{
__m256
tmp
=
_mm256_set1_ps
(
0.0
f
);
return
_mm256_max_ps
(
a
,
tmp
);
template
<
typename
T
>
inline
T
sigmoid
(
T
x
)
{
return
1.
/
(
1.
+
exp
(
-
x
));
}
__m256
sigmoid
(
const
__m256
a
)
{
__m256
max
=
_mm256_set1_ps
(
SIGMOID_THRESHOLD_MAX
);
__m256
min
=
_mm256_set1_ps
(
SIGMOID_THRESHOLD_MIN
);
__m256
tmp
=
_mm256_max_ps
(
a
,
min
);
tmp
=
_mm256_min_ps
(
tmp
,
max
);
tmp
=
_mm256_sub_ps
(
_mm256_set1_ps
(
0.0
f
),
tmp
);
tmp
=
exp
(
tmp
);
tmp
=
_mm256_add_ps
(
_mm256_set1_ps
(
1.0
f
),
tmp
);
tmp
=
_mm256_div_ps
(
_mm256_set1_ps
(
1.0
f
),
tmp
);
return
tmp
;
template
<
typename
T
>
inline
T
tanh
(
T
x
)
{
return
2.
*
sigmoid
(
2.
*
x
)
-
1.
;
}
__m256
tanh
(
const
__m256
a
)
{
__m256
max
=
_mm256_set1_ps
(
EXP_MAX_INPUT
);
__m256
tmp
=
_mm256_mul_ps
(
_mm256_set1_ps
(
-
2.0
f
),
a
)
;
tmp
=
_mm256_min_ps
(
tmp
,
max
)
;
tmp
=
exp
(
tmp
);
return
_mm256_sub_ps
(
_mm256_div_ps
(
_mm256_set1_ps
(
2.0
f
),
_mm256_add_ps
(
_mm256_set1_ps
(
1.0
f
),
tmp
)),
_mm256_set1_ps
(
1.0
f
));
template
<
typename
T
>
inline
void
vec_sigmoid
(
const
int
n
,
const
T
*
x
,
T
*
y
)
{
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
T
tmp
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
1.0
/
(
1.0
+
std
::
exp
(
-
tmp
));
}
}
__m256
linear
(
const
__m256
a
)
{
return
a
;
}
inline
void
vec_sigmoid
(
const
T
*
x
,
T
*
y
)
{
const
real
min
=
SIGMOID_THRESHOLD_MIN
;
const
real
max
=
SIGMOID_THRESHOLD_MAX
;
real
tmp
=
(
a
<
min
)
?
min
:
((
a
>
max
)
?
max
:
a
);
return
1.0
/
(
1.0
+
exp
(
-
tmp
));
template
<
typename
T
>
inline
void
vec_tanh
(
const
int
n
,
const
T
*
x
,
T
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
tanh
<
T
>
(
x
[
i
]);
}
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
T
>
class
AttentionLSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
// T x M
auto
*
h0
=
ctx
.
Input
<
Tensor
>
(
"H0"
);
// N x D
auto
*
c0
=
ctx
.
Input
<
Tensor
>
(
"C0"
);
// N x D
...
...
@@ -334,7 +326,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
auto
*
hidden_out
=
ctx
.
Output
<
LoDTensor
>
(
"Hidden"
);
// TxD
auto
*
cell_out
=
ctx
.
Output
<
LoDTensor
>
(
"Cell"
);
// TxD
auto
*
atted_x
=
ctx
.
Output
<
Tensor
>
(
"AttentionedX"
);
// T x 1
auto
*
fc_out
=
ctx
.
Output
<
Tensor
>
(
'
AttentionFCOut
'
);
// max_seq_len x 1
auto
*
fc_out
=
ctx
.
Output
<
Tensor
>
(
"AttentionFCOut"
);
// max_seq_len x 1
auto
*
lstm_x
=
ctx
.
Output
<
Tensor
>
(
"LSTMX"
);
// 1 x M
auto
*
lstm_out
=
ctx
.
Output
<
Tensor
>
(
"LSTMOUT"
);
// 1 x 4D
...
...
@@ -342,9 +334,10 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
auto
x_lod
=
x
->
lod
();
const
int
N
=
x_lod
[
0
].
size
()
-
1
;
// batch size
auto
x_dims
=
x
->
dims
();
// T x M
auto
w_dims
=
w
->
dims
();
// (D+M) x 4D
const
int
M
=
x_dims
[
1
];
// x frame size
const
int
D
=
w_dims
[
1
]
/
4
;
// gate frame size
auto
w_dims
=
lstm_w
->
dims
();
// (D+M) x 4D
const
int
total_T
=
x_dims
[
0
];
const
int
M
=
x_dims
[
1
];
// x frame size
const
int
D
=
w_dims
[
1
]
/
4
;
// gate frame size
const
int
D2
=
D
*
2
;
const
int
D3
=
D
*
3
;
const
int
D4
=
w_dims
[
1
];
...
...
@@ -357,6 +350,8 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ
(
c0
->
dims
()[
0
],
N
,
"C0 dims should be %d x %d."
,
N
,
D
);
fc_out
->
Resize
({
max_seq_len
,
1
});
// TODO(TJ): act functor init here
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
h0_data
=
h0
->
data
<
T
>
();
const
T
*
c0_data
=
c0
->
data
<
T
>
();
...
...
@@ -368,16 +363,16 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
const
T
*
atten_scalar_bias_data
=
atten_scalar_bias
?
atten_scalar_bias
->
data
<
T
>
()
:
NULL
;
T
*
hidden_out_data
=
hidden_out
->
mutable_data
<
T
>
();
T
*
cell_out_data
=
cell_out
->
mutable_data
<
T
>
();
T
*
atted_x_data
=
atted_x
->
mutable_data
<
T
>
();
T
*
fc_out_data
=
fc_out
->
mutable_data
<
T
>
();
T
*
lstm_x_data
=
lstm_x
->
mutable_data
<
T
>
();
T
*
lstm_out_data
=
lstm_out
->
mutable_data
<
T
>
();
T
*
hidden_out_data
=
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()
);
T
*
cell_out_data
=
cell_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()
);
T
*
atted_x_data
=
atted_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()
);
T
*
fc_out_data
=
fc_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()
);
T
*
lstm_x_data
=
lstm_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()
);
T
*
lstm_out_data
=
lstm_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()
);
// x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
ctx
);
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
T
,
1
,
M
,
x_data
,
atten_w_data
,
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
total_
T
,
1
,
M
,
x_data
,
atten_w_data
,
atted_x_data
,
atten_b_data
);
const
T
*
cur_x_data
=
x_data
;
...
...
@@ -400,7 +395,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
// fc2: scalar
if
(
atten_scalar_data
)
{
// x = a*x
blas
.
SCAL
(
seq_len
,
atten_scalar_data
,
fc_out_data
);
blas
.
SCAL
(
seq_len
,
*
atten_scalar_data
,
fc_out_data
);
bias_relu
<
T
>
(
seq_len
,
fc_out_data
,
atten_scalar_bias_data
,
fc_out_data
);
}
...
...
@@ -431,16 +426,16 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
blas
.
VMUL
(
D
,
lstm_out_data
,
prev_cell_data
,
lstm_out_data
);
// b = input * tilde
blas
.
VMUL
(
D
,
lstm_out_data
+
D
,
lstm_out
+
D3
,
lstm_out_data
+
D
);
blas
.
VMUL
(
D
,
lstm_out_data
+
D
,
lstm_out
_data
+
D3
,
lstm_out_data
+
D
);
// cell_out = a + b
blas
.
VADD
(
D
,
lstm_out_data
,
lstm_out_data
+
D
,
cur_cell_out_data
);
// state act tanh(cell_out) * output_gate
vec_tanh
(
D
,
cur_cell_out_data
,
lstm_out_data
);
blas
.
VMUL
(
D
,
lstm_out_data
,
lstm_out
+
D2
,
cur_hidden_out_data
);
blas
.
VMUL
(
D
,
lstm_out_data
,
lstm_out
_data
+
D2
,
cur_hidden_out_data
);
prev_hidden_data
=
hidden_out
+
i
*
gate_size
;
prev_hidden_data
=
cur_hidden_out_data
;
prev_cell_data
=
cur_cell_out_data
;
cur_cell_out_data
=
cur_cell_out_data
+
D
;
cur_hidden_out_data
=
cur_hidden_out_data
+
D
;
...
...
@@ -458,7 +453,5 @@ REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp,
ops
::
AttentionLSTMOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OP_CPU_KERNEL
(
attention_lstm
,
ops
::
AttentionLSTMKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
AttentionLSTMKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
REGISTER_OP_CPU_KERNEL
(
attention_lstm
,
ops
::
AttentionLSTMKernel
<
float
>
,
ops
::
AttentionLSTMKernel
<
double
>
);
paddle/fluid/operators/math/blas.h
浏览文件 @
cf5ea925
...
...
@@ -160,7 +160,7 @@ class Blas {
T
DOT
(
int
n
,
const
T
*
x
,
const
T
*
y
)
const
;
template
<
typename
T
>
void
SCAL
(
int
n
,
const
T
a
,
const
T
*
x
)
const
;
void
SCAL
(
int
n
,
const
T
a
,
T
*
x
)
const
;
template
<
typename
T
>
void
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
...
...
@@ -233,11 +233,26 @@ class BlasT : private Blas<DeviceContext> {
Base
()
->
template
VCOPY
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
VEXP
(
ARGS
...
args
)
const
{
Base
()
->
template
VEXP
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
GEMV
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMV
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
T
DOT
(
ARGS
...
args
)
const
{
return
Base
()
->
template
DOT
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
SCAL
(
ARGS
...
args
)
const
{
Base
()
->
template
SCAL
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
BatchedGEMM
(
ARGS
...
args
)
const
{
Base
()
->
template
BatchedGEMM
<
T
>(
args
...);
...
...
paddle/fluid/operators/math/blas_impl.h
浏览文件 @
cf5ea925
...
...
@@ -415,8 +415,7 @@ T Blas<platform::CPUDeviceContext>::DOT(int n, const T *x, const T *y) const {
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
SCAL
(
int
n
,
const
T
a
,
const
T
*
x
)
const
{
void
Blas
<
platform
::
CPUDeviceContext
>::
SCAL
(
int
n
,
const
T
a
,
T
*
x
)
const
{
#ifdef PADDLE_WITH_MKLML
CBlas
<
T
>::
SCAL
(
n
,
a
,
x
,
1
);
#else
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录