Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
2a8dbd13
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2a8dbd13
编写于
10月 17, 2017
作者:
D
dangqingqing
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
LSTM Operator forward implementation.
上级
83627d3e
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
730 addition
and
301 deletion
+730
-301
paddle/framework/CMakeLists.txt
paddle/framework/CMakeLists.txt
+2
-2
paddle/operators/CMakeLists.txt
paddle/operators/CMakeLists.txt
+3
-1
paddle/operators/lstm_op.cc
paddle/operators/lstm_op.cc
+24
-17
paddle/operators/lstm_op.h
paddle/operators/lstm_op.h
+99
-9
paddle/operators/math/CMakeLists.txt
paddle/operators/math/CMakeLists.txt
+4
-1
paddle/operators/math/detail/hl_activation_functions.h
paddle/operators/math/detail/hl_activation_functions.h
+135
-11
paddle/operators/math/detail/hl_cpu_functions.cc
paddle/operators/math/detail/hl_cpu_functions.cc
+0
-44
paddle/operators/math/detail/hl_functions.h
paddle/operators/math/detail/hl_functions.h
+79
-16
paddle/operators/math/detail/hl_gpu_functions.h
paddle/operators/math/detail/hl_gpu_functions.h
+37
-28
paddle/operators/math/detail/lstm_cpu_kernel.h
paddle/operators/math/detail/lstm_cpu_kernel.h
+25
-21
paddle/operators/math/detail/lstm_gpu_kernel.h
paddle/operators/math/detail/lstm_gpu_kernel.h
+45
-29
paddle/operators/math/detail/lstm_kernel.h
paddle/operators/math/detail/lstm_kernel.h
+14
-15
paddle/operators/math/lstm_compute.cc
paddle/operators/math/lstm_compute.cc
+29
-23
paddle/operators/math/lstm_compute.cu
paddle/operators/math/lstm_compute.cu
+35
-28
paddle/operators/math/lstm_compute.h
paddle/operators/math/lstm_compute.h
+27
-24
paddle/operators/math/sequence2batch.cc
paddle/operators/math/sequence2batch.cc
+9
-5
paddle/operators/math/sequence2batch.cu
paddle/operators/math/sequence2batch.cu
+15
-10
paddle/operators/math/sequence2batch.h
paddle/operators/math/sequence2batch.h
+32
-17
python/paddle/v2/framework/tests/test_lstm_op.py
python/paddle/v2/framework/tests/test_lstm_op.py
+116
-0
未找到文件。
paddle/framework/CMakeLists.txt
浏览文件 @
2a8dbd13
...
...
@@ -46,9 +46,9 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope frame
set
(
EXECUTOR_TEST_OP elementwise_add_op gaussian_random_op feed_op fetch_op
mul_op sum_op squared_l2_distance_op fill_constant_op sgd_op mean_op
)
if
(
WITH_GPU
)
nv_test
(
executor_test SRCS executor_test.cc DEPS executor
${
EXECUTOR_TEST_OP
}
)
#
nv_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP})
else
()
cc_test
(
executor_test SRCS executor_test.cc DEPS executor
${
EXECUTOR_TEST_OP
}
)
#
cc_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP})
endif
()
cc_library
(
tensor_array SRCS tensor_array.cc DEPS lod_tensor
)
...
...
paddle/operators/CMakeLists.txt
浏览文件 @
2a8dbd13
...
...
@@ -115,7 +115,8 @@ set(DEPS_OPS
softmax_with_cross_entropy_op
sum_op
pool_op
pool_with_index_op
)
pool_with_index_op
lstm_op
)
op_library
(
recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
...
...
@@ -126,6 +127,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library
(
sum_op DEPS net_op
)
op_library
(
pool_op DEPS pooling
)
op_library
(
pool_with_index_op DEPS pooling
)
op_library
(
lstm_op DEPS sequence2batch
)
list
(
REMOVE_ITEM GENERAL_OPS
${
DEPS_OPS
}
)
foreach
(
src
${
GENERAL_OPS
}
)
...
...
paddle/operators/lstm_op.cc
浏览文件 @
2a8dbd13
...
...
@@ -22,12 +22,12 @@ class LSTMOp : public framework::OperatorWithKernel {
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
protected:
void
InferShape
(
framework
::
InferShapeContext
Base
*
ctx
)
const
override
{
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Input"
),
"Input(Input) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Hidden"
),
"Output(Hidden) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"
H
"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"
Cell
"
),
"Output(Cell) of LSTM should not be null."
);
auto
x_dims
=
ctx
->
GetInputDim
(
"Input"
);
...
...
@@ -60,7 +60,7 @@ class LSTMOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ
(
b_dims
.
size
(),
2
,
"The rank of Input(Bias) should be 2."
);
PADDLE_ENFORCE_EQ
(
b_dims
[
0
],
1
,
"The first dimension of Input(Bias) should be 1."
);
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"use
_p
eepholes"
))
{
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"use
P
eepholes"
))
{
PADDLE_ENFORCE_EQ
(
b_dims
[
1
],
7
*
frame_size
,
"The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection"
,
...
...
@@ -73,7 +73,7 @@ class LSTMOp : public framework::OperatorWithKernel {
}
ctx
->
SetOutputDim
(
"Hidden"
,
x_dims
);
ctx
->
SetOutputDim
(
"Cell"
,
x_dims
);
ctx
->
SetOutputDim
(
"
Hidden
"
,
x_dims
);
ctx
->
SetOutputDim
(
"
Batch
"
,
x_dims
);
ctx
->
ShareLoD
(
"Input"
,
"Hidden"
);
ctx
->
ShareLoD
(
"Input"
,
"Cell"
);
}
...
...
@@ -86,7 +86,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput
(
"Input"
,
"(LoDTensor) the first input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in "
"this LoDTenosr is a matrix with shape (T X D), where, T is the "
"this LoDTenosr is a matrix with shape (T X
4
D), where, T is the "
"total time steps in this mini-batch, D is the hidden size."
);
AddInput
(
"H0"
,
"(Tensor, optional) the initial hidden state is an optional "
...
...
@@ -103,14 +103,21 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput
(
"Bias"
,
"(Tensor) the learnable weights, which contains two parts: "
"input-hidden bias weight and peephole connections weight if "
"seting `use
_p
eepholes` True. "
"1. `use
_p
eepholes = False` "
"seting `use
P
eepholes` True. "
"1. `use
P
eepholes = False` "
" - The shape is (1 x 4*D). "
" - Bias = {b_i, b_f, b_c, b_o}."
"2. `use
_p
eepholes = True` "
"2. `use
P
eepholes = True` "
" - The shape is (1 x 7*D). "
" - Bias = {b_i, b_f, b_c, b_o, W_ic, W_fc, W_oc}."
);
AddOutput
(
"Batch"
,
"(LoDTensor) save the reorganized input as batch info. "
)
AddOutput
(
"BatchGate"
,
"(LoDTensor) This LoDTensor contains input gate, forget gate "
"and output gate aftern the nonlinear computation. This "
"LoDTensor has the same shape with the reorganized input, which "
"was also be called batch input. The LoD size is 2. The first "
"LoD is the batch offsets and the second LoD contains the "
"indexes, which denote the position of reorganized sequence "
"in the raw input."
)
.
AsIntermediate
();
AddOutput
(
"Hidden"
,
"(LoDTensor) the hidden state lod tensor of LSTM operator. "
...
...
@@ -118,25 +125,25 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput
(
"Cell"
,
"(LoDTensor) the cell state lod tensor of LSTM operator. "
"The shape and lod is the same with the `Input`."
);
AddAttr
<
bool
>
(
"use
_p
eepholes"
,
AddAttr
<
bool
>
(
"use
P
eepholes"
,
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections."
)
.
SetDefault
(
true
);
AddAttr
<
bool
>
(
"is
_r
everse"
,
AddAttr
<
bool
>
(
"is
R
everse"
,
"(bool, defalut: False) "
"whether to compute reversed LSTM."
)
.
SetDefault
(
tru
e
);
.
SetDefault
(
fals
e
);
AddAttr
<
std
::
string
>
(
"gate
_a
ctivation"
,
"gate
A
ctivation"
,
"(string, defalut: sigmoid)"
"The activation for input gate, forget gate and output "
"gate, `sigmoid` by defalut."
)
.
SetDefault
(
"sigmoid"
);
AddAttr
<
std
::
string
>
(
"cell
_a
ctivation"
,
AddAttr
<
std
::
string
>
(
"cell
A
ctivation"
,
"(string, defalut: tanh)"
"The activation for cell output, `tanh` by defalut."
)
.
SetDefault
(
"tanh"
);
AddAttr
<
std
::
string
>
(
"candidate
_a
ctivation"
,
AddAttr
<
std
::
string
>
(
"candidate
A
ctivation"
,
"(string, defalut: tanh)"
"The activation for candidate hidden state, "
"`tanh` by defalut."
)
...
...
@@ -173,7 +180,7 @@ are the cell input and cell output activation functions, `tanh` is usually
used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Set `use
_p
eepholes` False to disable peephole connection [2]. The formula
Set `use
P
eepholes` False to disable peephole connection [2]. The formula
is omitted here.
@note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
...
...
@@ -196,7 +203,7 @@ class LSTMGradOp : public framework::OperatorWithKernel {
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
protected:
void
InferShape
(
framework
::
InferShapeContext
Base
*
ctx
)
const
override
{
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Hidden"
)),
"Input(Hidden@GRAD) should not be null"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Cell"
)),
...
...
paddle/operators/lstm_op.h
浏览文件 @
2a8dbd13
...
...
@@ -14,30 +14,120 @@ limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence2batch.h"
namespace
paddle
{
namespace
operators
{
using
framework
::
LoDTensor
;
using
framework
::
Tensor
;
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenMatrix
=
framework
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
Place
,
typename
T
>
class
LSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
input_t
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Input"
);
auto
*
batch_t
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Batch"
);
auto
*
bias_t
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Bias"
);
bool
is_reverse
=
ctx
.
Attr
<
bool
>
(
"is_reverse"
);
LoDTensor2BatchFunctor
<
Place
,
T
>
to_batch
(
ctx
.
device_context
(),
input_t
,
batch_t
,
is_reverse
);
auto
in_dims
=
input_t
->
dims
();
auto
*
input
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Input"
);
auto
*
weight
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Weight"
);
auto
*
bias
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Bias"
);
auto
*
batch_gate
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"BatchGate"
);
batch_gate
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
hidden_out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"Hidden"
);
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
cell_out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"Cell"
);
cell_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Now the function ShareLoD in InferShape is not implemented.
// So copy LoD here.
ctx
.
ShareLoD
(
"Input"
,
"Hidden"
);
ctx
.
ShareLoD
(
"Input"
,
"Cell"
);
bool
is_reverse
=
ctx
.
Attr
<
bool
>
(
"isReverse"
);
math
::
LoDTensor2BatchFunctor
<
Place
,
T
>
to_batch
;
to_batch
(
ctx
.
device_context
(),
*
input
,
*
batch_gate
,
is_reverse
);
auto
in_dims
=
input
->
dims
();
int
frame_size
=
in_dims
[
1
];
if
(
bias_t
)
{
if
(
bias
)
{
Eigen
::
array
<
int
,
2
>
extents
({{
1
,
4
*
frame_size
}});
Eigen
::
array
<
int
,
2
>
offsets
({{
0
,
0
}});
auto
b
=
EigenMatrix
<
T
>::
From
(
*
bias
);
auto
gate
=
EigenMatrix
<
T
>::
From
(
*
batch_gate
);
gate
.
device
(
ctx
.
GetEigenDevice
<
Place
>
())
=
gate
+
b
.
slice
(
offsets
,
extents
)
.
reshape
(
Eigen
::
array
<
int
,
2
>
({{
1
,
frame_size
*
4
}}))
.
broadcast
(
Eigen
::
array
<
int
,
2
>
({{
static_cast
<
int
>
(
in_dims
[
0
]),
1
}}));
}
math
::
LstmMetaValue
<
T
>
lstm_value
;
T
*
bias_data
=
const_cast
<
T
*>
(
bias
->
data
<
T
>
());
// the code styple in LstmMetaValue will be updated later.
lstm_value
.
checkIg
=
bias_data
+
4
*
frame_size
;
lstm_value
.
checkFg
=
lstm_value
.
checkIg
+
frame_size
;
lstm_value
.
checkOg
=
lstm_value
.
checkFg
+
frame_size
;
lstm_value
.
prevStateValue
=
nullptr
;
framework
::
LoDTensor
batch_out
;
batch_out
.
mutable_data
<
T
>
(
in_dims
,
ctx
.
GetPlace
());
framework
::
LoDTensor
batch_cell
;
batch_cell
.
mutable_data
<
T
>
(
in_dims
,
ctx
.
GetPlace
());
framework
::
LoDTensor
batch_cell_pre_act
;
batch_cell_pre_act
.
mutable_data
<
T
>
(
in_dims
,
ctx
.
GetPlace
());
auto
batch_lod
=
batch_gate
->
lod
()[
0
];
int
num_batch
=
batch_lod
.
size
()
-
1
;
auto
gate_act
=
ctx
.
Attr
<
std
::
string
>
(
"gateActivation"
);
auto
cell_act
=
ctx
.
Attr
<
std
::
string
>
(
"cellActivation"
);
auto
cand_act
=
ctx
.
Attr
<
std
::
string
>
(
"candidateActivation"
);
for
(
int
n
=
0
;
n
<
num_batch
;
n
++
)
{
int
bstart
=
batch_lod
[
n
];
int
bend
=
batch_lod
[
n
+
1
];
Tensor
gate_t
=
batch_gate
->
Slice
<
T
>
(
bstart
,
bend
);
Tensor
out_t
=
batch_out
.
Slice
<
T
>
(
bstart
,
bend
);
Tensor
cell_t
=
batch_cell
.
Slice
<
T
>
(
bstart
,
bend
);
Tensor
cell_pre_act_t
=
batch_cell_pre_act
.
Slice
<
T
>
(
bstart
,
bend
);
int
cur_batch_size
=
bend
-
bstart
;
if
(
n
!=
0
)
{
int
pre_end
=
batch_lod
[
n
-
1
];
auto
pre_hidden_t
=
batch_out
.
Slice
<
T
>
(
pre_end
,
bstart
);
math
::
matmul
<
Place
,
T
>
(
ctx
.
device_context
(),
pre_hidden_t
,
false
,
*
weight
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
0.0
));
}
// else if : how to pass the state from
// last mini-batch will be supported later
lstm_value
.
gateValue
=
gate_t
.
data
<
T
>
();
lstm_value
.
outputValue
=
out_t
.
data
<
T
>
();
lstm_value
.
stateValue
=
cell_t
.
data
<
T
>
();
lstm_value
.
stateActiveValue
=
cell_pre_act_t
.
data
<
T
>
();
math
::
LstmUnitFunctor
<
Place
,
T
>::
compute
(
ctx
.
device_context
(),
lstm_value
,
frame_size
,
cur_batch_size
,
gate_act
,
cell_act
,
cand_act
);
lstm_value
.
prevStateValue
=
lstm_value
.
stateValue
;
}
math
::
Batch2LoDTensorFunctor
<
Place
,
T
>
to_seq
;
batch_out
.
set_lod
(
batch_gate
->
lod
());
// restore the output hidden in LoDTensor from the batch hidden
to_seq
(
ctx
.
device_context
(),
batch_out
,
*
hidden_out
);
batch_out
.
set_lod
(
batch_gate
->
lod
());
// restore the output cell state in LoDTensor from the batch cell
to_seq
(
ctx
.
device_context
(),
batch_cell
,
*
cell_out
);
}
};
...
...
paddle/operators/math/CMakeLists.txt
浏览文件 @
2a8dbd13
...
...
@@ -5,13 +5,16 @@ if(WITH_GPU)
nv_library
(
cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator
)
nv_library
(
pooling SRCS pooling.cc pooling.cu DEPS device_context
)
nv_library
(
vol2col SRCS vol2col.cc vol2col.cu DEPS device_context
)
nv_library
(
sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context
)
nv_library
(
lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context
)
else
()
cc_library
(
math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator
)
cc_test
(
math_function_test SRCS math_function_test.cc DEPS math_function tensor
)
cc_library
(
softmax SRCS softmax.cc DEPS operator
)
cc_library
(
cross_entropy SRCS cross_entropy.cc DEPS operator
)
cc_library
(
pooling SRCS pooling.cc DEPS device_context
)
cc_library
(
vol2col SRCS vol2col.cc DEPS device_context
)
cc_library
(
sequence2batch SRCS sequence2batch.cc DEPS device_context
)
cc_library
(
lstm_compute SRCS lstm_compute.cc DEPS device_context
)
endif
()
cc_test
(
im2col_test SRCS im2col_test.cc DEPS math_function tensor
)
...
...
paddle/operators/math/detail/hl_activation_functions.h
浏览文件 @
2a8dbd13
...
...
@@ -16,15 +16,30 @@ limitations under the License. */
#define HL_ACTIVATION_FUNCTIONS_H_
#include "hl_functions.h"
#include "paddle/operators/math/lstm_compute.h"
/**
* Active functions: sigmoid, relu, tanh and linear.
*/
#define HPPL_ACTIVE_FUNCTION \
#define FLOAT_ACTIVE_FUNCTION \
{ \
hppl::typef::sigmoid, hppl::typef::relu, hppl::typef::tanh, \
hppl::typef::linear \
}
#define DOUBLE_ACTIVE_FUNCTION \
{ \
hppl::typed::sigmoid, hppl::typed::relu, hppl::typed::tanh, \
hppl::typed::linear \
}
#define AVX_ACTIVE_FUNCTION \
{ hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
namespace
hppl
{
using
activation_mode_t
=
paddle
::
operators
::
math
::
activation_mode_t
;
/**
* Hppl supports sigmoid, relu, tanh, linear active functions
* for neural networks' forward and backward activation.
...
...
@@ -36,25 +51,134 @@ class Active {
typedef
T
(
*
backward
)(
T
,
T
);
};
template
<
typename
T
>
struct
ForwardActType
;
template
<
>
struct
ForwardActType
<
float
>
{
using
type
=
Active
<
float
>::
forward
;
};
template
<
>
struct
ForwardActType
<
double
>
{
using
type
=
Active
<
double
>::
forward
;
};
template
<
typename
T
>
struct
BackwardActType
;
template
<
>
struct
BackwardActType
<
float
>
{
using
type
=
Active
<
float
>::
backward
;
};
template
<
>
struct
BackwardActType
<
double
>
{
using
type
=
Active
<
double
>::
backward
;
};
#ifdef __NVCC__
namespace
gpu
{
static
__device__
Active
<
float
>::
forward
forward
[]
=
HPPL_ACTIVE_FUNCTION
;
static
__device__
Active
<
float
>::
backward
backward
[]
=
HPPL_ACTIVE_FUNCTION
;
static
__device__
Active
<
double
>::
forward
forward
[]
=
HPPL_ACTIVE_FUNCTION
;
static
__device__
Active
<
double
>::
backward
backward
[]
=
HPPL_ACTIVE_FUNCTION
;
static
__device__
Active
<
float
>::
forward
forward
[]
=
FLOAT_ACTIVE_FUNCTION
;
static
__device__
Active
<
float
>::
backward
backward
[]
=
FLOAT_ACTIVE_FUNCTION
;
static
__device__
Active
<
double
>::
forward
forward_d
[]
=
DOUBLE_ACTIVE_FUNCTION
;
static
__device__
Active
<
double
>::
backward
backward_d
[]
=
DOUBLE_ACTIVE_FUNCTION
;
template
<
typename
T
>
struct
ForwardAct
{
__device__
typename
ForwardActType
<
T
>::
type
operator
()(
activation_mode_t
type
);
};
template
<
>
struct
ForwardAct
<
float
>
{
__device__
ForwardActType
<
float
>::
type
operator
()(
activation_mode_t
type
)
{
return
forward
[
type
];
}
};
template
<
>
struct
ForwardAct
<
double
>
{
__device__
ForwardActType
<
double
>::
type
operator
()(
activation_mode_t
type
)
{
return
forward_d
[
type
];
}
};
template
<
typename
T
>
struct
BackwardAct
{
__device__
typename
BackwardActType
<
T
>::
type
operator
()(
activation_mode_t
type
);
};
template
<
>
struct
BackwardAct
<
float
>
{
__device__
BackwardActType
<
float
>::
type
operator
()(
activation_mode_t
type
)
{
return
backward
[
type
];
}
};
template
<
>
struct
BackwardAct
<
double
>
{
__device__
BackwardActType
<
double
>::
type
operator
()(
activation_mode_t
type
)
{
return
backward_d
[
type
];
}
};
}
// namespace gpu
#else
namespace
cpu
{
static
Active
<
float
>::
forward
forward
[]
=
HPPL_ACTIVE_FUNCTION
;
static
Active
<
float
>::
backward
backward
[]
=
HPPL_ACTIVE_FUNCTION
;
static
Active
<
double
>::
forward
forward
[]
=
HPPL_ACTIVE_FUNCTION
;
static
Active
<
double
>::
backward
backward
[]
=
HPPL_ACTIVE_FUNCTION
;
static
Active
<
float
>::
forward
forward
[]
=
FLOAT_ACTIVE_FUNCTION
;
static
Active
<
float
>::
backward
backward
[]
=
FLOAT_ACTIVE_FUNCTION
;
static
Active
<
double
>::
forward
forward_d
[]
=
DOUBLE_ACTIVE_FUNCTION
;
static
Active
<
double
>::
backward
backward_d
[]
=
DOUBLE_ACTIVE_FUNCTION
;
template
<
typename
T
>
struct
ForwardAct
{
typename
ForwardActType
<
T
>::
type
operator
()(
activation_mode_t
type
);
};
template
<
>
struct
ForwardAct
<
float
>
{
ForwardActType
<
float
>::
type
operator
()(
activation_mode_t
type
)
{
return
forward
[
type
];
}
};
template
<
>
struct
ForwardAct
<
double
>
{
ForwardActType
<
double
>::
type
operator
()(
activation_mode_t
type
)
{
return
forward_d
[
type
];
}
};
template
<
typename
T
>
struct
BackwardAct
{
typename
BackwardActType
<
T
>::
type
operator
()(
activation_mode_t
type
);
};
template
<
>
struct
BackwardAct
<
float
>
{
BackwardActType
<
float
>::
type
operator
()(
activation_mode_t
type
)
{
return
backward
[
type
];
}
};
template
<
>
struct
BackwardAct
<
double
>
{
BackwardActType
<
double
>::
type
operator
()(
activation_mode_t
type
)
{
return
backward_d
[
type
];
}
};
}
// namespace cpu
#ifdef __AVX__
namespace
avx
{
static
Active
<
__m256
>::
forward
forward
[]
=
HPPL
_ACTIVE_FUNCTION
;
static
Active
<
__m256
>::
backward
backward
[]
=
HPPL
_ACTIVE_FUNCTION
;
static
Active
<
__m256
>::
forward
forward
[]
=
AVX
_ACTIVE_FUNCTION
;
static
Active
<
__m256
>::
backward
backward
[]
=
AVX
_ACTIVE_FUNCTION
;
}
// namespace avx
#endif
#endif
...
...
paddle/operators/math/detail/hl_cpu_functions.cc
已删除
100644 → 0
浏览文件 @
83627d3e
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <math.h>
#include "/paddle/operators/math/detail/hl_functions.h"
namespace
hppl
{
real
relu
(
const
real
a
)
{
return
a
>
0.0
f
?
a
:
0.0
f
;
}
real
sigmoid
(
const
real
a
)
{
const
real
min
=
SIGMOID_THRESHOLD_MIN
;
const
real
max
=
SIGMOID_THRESHOLD_MAX
;
real
tmp
=
(
a
<
min
)
?
min
:
((
a
>
max
)
?
max
:
a
);
return
1.0
/
(
1.0
+
exp
(
-
tmp
));
}
real
tanh
(
const
real
a
)
{
real
tmp
=
-
2.0
*
a
;
tmp
=
(
tmp
>
EXP_MAX_INPUT
)
?
EXP_MAX_INPUT
:
tmp
;
return
(
2.0
/
(
1.0
+
exp
(
tmp
)))
-
1.0
;
}
real
linear
(
const
real
a
)
{
return
a
;
}
real
relu
(
const
real
a
,
const
real
b
)
{
return
a
*
(
b
>
0.0
f
?
1.0
f
:
0.0
f
);
}
real
sigmoid
(
const
real
a
,
const
real
b
)
{
return
a
*
b
*
(
1
-
b
);
}
real
tanh
(
const
real
a
,
const
real
b
)
{
return
a
*
(
1.0
f
-
b
*
b
);
}
real
linear
(
const
real
a
,
const
real
b
)
{
return
a
;
}
}
// namespace hppl
paddle/operators/math/detail/hl_functions.h
浏览文件 @
2a8dbd13
...
...
@@ -25,31 +25,94 @@ limitations under the License. */
*/
#define SIGMOID_THRESHOLD_MAX 13.0
/**
* The maximum input value for exp, used to avoid overflow problem.
* currently only used for tanh function.
*/
#define EXP_MAX_INPUT 40.0
#ifndef __NVCC__
namespace
hppl
{
namespace
typef
{
/*
* forward activation
*/
float
relu
(
const
float
a
)
{
return
a
>
static_cast
<
float
>
(
0.0
)
?
a
:
static_cast
<
float
>
(
0.0
);
}
float
sigmoid
(
const
float
a
)
{
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
float
tmp
=
(
a
<
min
)
?
min
:
((
a
>
max
)
?
max
:
a
);
return
static_cast
<
float
>
(
1.0
)
/
(
static_cast
<
float
>
(
1.0
)
+
exp
(
-
tmp
));
}
float
tanh
(
const
float
a
)
{
float
tmp
=
-
2.0
*
a
;
tmp
=
(
tmp
>
EXP_MAX_INPUT
)
?
EXP_MAX_INPUT
:
tmp
;
return
(
2.0
/
(
1.0
+
exp
(
tmp
)))
-
1.0
;
}
float
linear
(
const
float
a
)
{
return
a
;
}
/*
* backward activation
*/
float
relu
(
const
float
a
,
const
float
b
)
{
return
a
*
(
b
>
0.0
?
1.0
:
0.0
);
}
float
sigmoid
(
const
float
a
,
const
float
b
)
{
return
a
*
b
*
(
static_cast
<
float
>
(
1
)
-
b
);
}
float
tanh
(
const
float
a
,
const
float
b
)
{
return
a
*
(
static_cast
<
float
>
(
1
)
-
b
*
b
);
}
float
linear
(
const
float
a
,
const
float
b
)
{
return
a
;
}
}
// namespace typef
namespace
typed
{
/*
* forward activation
*/
template
<
typename
T
>
T
relu
(
const
T
a
);
template
<
typename
T
>
T
sigmoid
(
const
T
a
);
template
<
typename
T
>
T
tanh
(
const
T
a
);
template
<
typename
T
>
T
linear
(
const
T
a
);
double
relu
(
const
double
a
)
{
return
a
>
static_cast
<
double
>
(
0.0
)
?
a
:
static_cast
<
double
>
(
0.0
);
}
double
sigmoid
(
const
double
a
)
{
const
double
min
=
SIGMOID_THRESHOLD_MIN
;
const
double
max
=
SIGMOID_THRESHOLD_MAX
;
double
tmp
=
(
a
<
min
)
?
min
:
((
a
>
max
)
?
max
:
a
);
return
static_cast
<
double
>
(
1.0
)
/
(
static_cast
<
double
>
(
1.0
)
+
exp
(
-
tmp
));
}
double
tanh
(
const
double
a
)
{
double
tmp
=
-
2.0
*
a
;
tmp
=
(
tmp
>
EXP_MAX_INPUT
)
?
EXP_MAX_INPUT
:
tmp
;
return
(
2.0
/
(
1.0
+
exp
(
tmp
)))
-
1.0
;
}
double
linear
(
const
double
a
)
{
return
a
;
}
/*
* backward activation
*/
template
<
typename
T
>
T
relu
(
const
T
a
,
const
T
b
);
template
<
typename
T
>
T
sigmoid
(
const
T
a
,
const
T
b
);
template
<
typename
T
>
T
tanh
(
const
T
a
,
const
T
b
);
template
<
typename
T
>
T
linear
(
const
T
a
,
const
T
b
);
double
relu
(
const
double
a
,
const
double
b
)
{
return
a
*
(
b
>
0.0
?
1.0
:
0.0
);
}
double
sigmoid
(
const
double
a
,
const
double
b
)
{
return
a
*
b
*
(
static_cast
<
double
>
(
1
)
-
b
);
}
double
tanh
(
const
double
a
,
const
double
b
)
{
return
a
*
(
static_cast
<
double
>
(
1
)
-
b
*
b
);
}
double
linear
(
const
double
a
,
const
double
b
)
{
return
a
;
}
}
// namespace typed
}
// namespace hppl
#ifdef __AVX__
...
...
paddle/operators/math/detail/hl_gpu_functions.h
浏览文件 @
2a8dbd13
...
...
@@ -18,13 +18,10 @@ limitations under the License. */
#include "hl_base.h"
namespace
hppl
{
namespace
typef
{
template
<
typename
T
>
__device__
static
T
relu
(
const
T
a
)
{
return
a
>
0.0
f
?
a
:
0.0
f
;
}
__device__
static
float
relu
(
const
float
a
)
{
return
a
>
0.0
f
?
a
:
0.0
f
;
}
template
<
>
__device__
static
float
sigmoid
(
const
float
a
)
{
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
...
...
@@ -32,7 +29,32 @@ __device__ static float sigmoid(const float a) {
return
__fdividef
(
1.0
f
,
1.0
f
+
__expf
(
-
tmp
));
}
template
<
>
__device__
static
float
tanh
(
const
float
a
)
{
return
__fdividef
(
2.0
f
,
(
1.0
f
+
__expf
(
-
2.0
f
*
a
)))
-
1.0
f
;
}
__device__
static
float
linear
(
const
float
a
)
{
return
a
;
}
__device__
static
float
relu
(
const
float
a
,
const
float
b
)
{
return
a
*
(
b
>
0.0
f
?
1.0
f
:
0.0
f
);
}
__device__
static
float
sigmoid
(
const
float
a
,
const
float
b
)
{
return
a
*
b
*
(
1.0
f
-
b
);
}
__device__
static
float
tanh
(
const
float
a
,
const
float
b
)
{
return
a
*
(
1.0
f
-
b
*
b
);
}
__device__
static
float
linear
(
const
float
a
,
const
float
b
)
{
return
a
;
}
}
// namespace typef
namespace
typed
{
__device__
static
double
relu
(
const
double
a
)
{
return
a
>
0.0
?
a
:
0.0
;
}
__device__
static
double
sigmoid
(
const
double
a
)
{
const
double
min
=
SIGMOID_THRESHOLD_MIN
;
const
double
max
=
SIGMOID_THRESHOLD_MAX
;
...
...
@@ -40,40 +62,27 @@ __device__ static double sigmoid(const double a) {
return
1.0
/
(
1.0
+
exp
(
-
tmp
));
}
template
<
>
__device__
static
float
tanh
(
const
float
a
)
{
return
__fdividef
(
2.0
f
,
(
1.0
f
+
__expf
(
-
2.0
f
*
a
)))
-
1.0
f
;
}
template
<
>
__device__
static
double
tanh
(
const
double
a
)
{
return
(
2.0
/
(
1.0
+
exp
(
-
2.0
*
a
)))
-
1.0
;
}
template
<
typename
T
>
__device__
static
T
linear
(
const
T
a
)
{
return
a
;
}
__device__
static
double
linear
(
const
double
a
)
{
return
a
;
}
template
<
typename
T
>
__device__
static
T
relu
(
const
T
a
,
const
T
b
)
{
return
a
*
(
b
>
0.0
f
?
1.0
f
:
0.0
f
);
__device__
static
double
relu
(
const
double
a
,
const
double
b
)
{
return
a
*
(
b
>
0.0
?
1.0
:
0.0
);
}
template
<
typename
T
>
__device__
static
T
sigmoid
(
const
T
a
,
const
T
b
)
{
__device__
static
double
sigmoid
(
const
double
a
,
const
double
b
)
{
return
a
*
b
*
(
1
-
b
);
}
template
<
typename
T
>
__device__
static
T
tanh
(
const
T
a
,
const
T
b
)
{
return
a
*
(
1.0
f
-
b
*
b
);
__device__
static
double
tanh
(
const
double
a
,
const
double
b
)
{
return
a
*
(
1.0
-
b
*
b
);
}
template
<
typename
T
>
__device__
static
T
linear
(
const
T
a
,
const
T
b
)
{
return
a
;
}
__device__
static
double
linear
(
const
double
a
,
const
double
b
)
{
return
a
;
}
}
// namespace typef
}
// namespace hppl
...
...
paddle/operators/math/detail/lstm_cpu_kernel.h
浏览文件 @
2a8dbd13
...
...
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/operators/math/detail/hl_activation_functions.h"
#include "paddle/operators/math/lstm_compute.h"
namespace
paddle
{
...
...
@@ -23,7 +25,8 @@ namespace detail {
#ifndef __NVCC__
template
<
class
T
,
class
Op
>
void
naive_lstm_forward_one_sequence
(
Op
op
,
lstm_value
value
,
int
frameSize
,
void
naive_lstm_forward_one_sequence
(
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frameSize
,
activation_mode_t
active_node
,
activation_mode_t
active_gate
,
activation_mode_t
active_state
)
{
...
...
@@ -57,9 +60,10 @@ void naive_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize,
rPrevState
=
value
.
prevStateValue
[
i
];
}
hppl
::
cpu
::
ForwardAct
<
T
>
act
;
op
(
rValueIn
,
rValueIg
,
rValueFg
,
rValueOg
,
rPrevState
,
rState
,
rStateAtv
,
rOut
,
rCheckI
,
rCheckF
,
rCheckO
,
hppl
::
cpu
::
forward
[
active_node
]
,
hppl
::
cpu
::
forward
[
active_gate
],
hppl
::
cpu
::
forward
[
active_state
]
);
rOut
,
rCheckI
,
rCheckF
,
rCheckO
,
act
(
active_node
),
act
(
active_gate
)
,
act
(
active_state
)
);
valueIn
[
i
]
=
rValueIn
;
valueIg
[
i
]
=
rValueIg
;
...
...
@@ -72,8 +76,8 @@ void naive_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize,
}
template
<
class
T
,
class
Op
>
void
naive_lstm_backward_one_sequence
(
Op
op
,
lstm_value
value
,
lstm_grad
grad
,
int
frameSize
,
void
naive_lstm_backward_one_sequence
(
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frameSize
,
activation_mode_t
active_node
,
activation_mode_t
active_gate
,
activation_mode_t
active_state
)
{
...
...
@@ -123,11 +127,11 @@ void naive_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad,
rPrevState
=
value
.
prevStateValue
[
i
];
}
hppl
::
cpu
::
BackwardAct
<
T
>
act
;
op
(
rValueIn
,
rValueIg
,
rValueFg
,
rValueOg
,
rGradIn
,
rGradIg
,
rGradFg
,
rGradOg
,
rPrevState
,
rPrevStateGrad
,
rState
,
rStateGrad
,
rStateAtv
,
rOutputGrad
,
rCheckI
,
rCheckF
,
rCheckO
,
rCheckIGrad
,
rCheckFGrad
,
rCheckOGrad
,
hppl
::
cpu
::
backward
[
active_node
],
hppl
::
cpu
::
backward
[
active_gate
],
hppl
::
cpu
::
backward
[
active_state
]);
rCheckOGrad
,
act
(
active_node
),
act
(
active_gate
),
act
(
active_state
));
gradIn
[
i
]
=
rGradIn
;
gradIg
[
i
]
=
rGradIg
;
...
...
@@ -144,8 +148,8 @@ void naive_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad,
}
}
template
<
class
Op
>
void
avx_lstm_forward_one_sequence
(
Op
op
,
lstm_value
value
,
int
frameSize
,
template
<
class
T
,
class
Op
>
void
avx_lstm_forward_one_sequence
(
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frameSize
,
activation_mode_t
active_node
,
activation_mode_t
active_gate
,
activation_mode_t
active_state
)
{
...
...
@@ -195,9 +199,9 @@ void avx_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize,
#endif
}
template
<
class
Op
>
void
avx_lstm_backward_one_sequence
(
Op
op
,
lstm_value
value
,
lstm_grad
grad
,
int
frameSize
,
template
<
class
T
,
class
Op
>
void
avx_lstm_backward_one_sequence
(
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frameSize
,
activation_mode_t
active_node
,
activation_mode_t
active_gate
,
activation_mode_t
active_state
)
{
...
...
@@ -271,13 +275,13 @@ void avx_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad,
}
template
<
class
T
,
class
Op
>
void
cpu_lstm_forward
(
Op
op
,
lstm_value
value
,
int
frameSize
,
void
cpu_lstm_forward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frameSize
,
activation_mode_t
active_node
,
activation_mode_t
active_gate
,
activation_mode_t
active_state
)
{
if
(
Op
::
avx
&&
!
(
frameSize
&
(
8
-
1
))
&&
(
s
izeof
(
T
)
==
4
))
{
avx_lstm_forward_one_sequence
(
op
,
value
,
frameSize
,
active_node
,
active_gate
,
active_state
);
if
(
Op
::
avx
&&
!
(
frameSize
&
(
8
-
1
))
&&
(
s
td
::
is_same
<
T
,
float
>::
value
))
{
avx_lstm_forward_one_sequence
<
T
>
(
op
,
value
,
frameSize
,
active_node
,
active_gate
,
active_state
);
}
else
{
naive_lstm_forward_one_sequence
<
T
>
(
op
,
value
,
frameSize
,
active_node
,
active_gate
,
active_state
);
...
...
@@ -285,13 +289,13 @@ void cpu_lstm_forward(Op op, lstm_value value, int frameSize,
}
template
<
class
T
,
class
Op
>
void
cpu_lstm_backward
(
Op
op
,
lstm_value
value
,
lstm_grad
grad
,
int
frameSize
,
activation_mode_t
active_node
,
void
cpu_lstm_backward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frameSize
,
activation_mode_t
active_node
,
activation_mode_t
active_gate
,
activation_mode_t
active_state
)
{
if
(
Op
::
avx
&&
!
(
frameSize
&
(
8
-
1
))
&&
(
s
izeof
(
T
)
==
4
))
{
avx_lstm_backward_one_sequence
(
op
,
value
,
grad
,
frameSize
,
active_node
,
active_gate
,
active_state
);
if
(
Op
::
avx
&&
!
(
frameSize
&
(
8
-
1
))
&&
(
s
td
::
is_same
<
T
,
float
>::
value
))
{
avx_lstm_backward_one_sequence
<
T
>
(
op
,
value
,
grad
,
frameSize
,
active_node
,
active_gate
,
active_state
);
}
else
{
naive_lstm_backward_one_sequence
<
T
>
(
op
,
value
,
grad
,
frameSize
,
active_node
,
active_gate
,
active_state
);
...
...
paddle/operators/math/detail/lstm_gpu_kernel.h
浏览文件 @
2a8dbd13
...
...
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/operators/math/detail/lstm_kernel.h"
#include <type_traits>
#include "paddle/operators/math/detail/hl_activation_functions.h"
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/platform/cuda_helper.h"
#include "paddle/platform/device_context.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -27,10 +29,11 @@ namespace detail {
* grid(frameBlocks, batchBlocks)
*/
template
<
class
T
,
class
Op
,
bool
isBatch
>
__global__
void
KeLstmForward
(
Op
op
,
lstm_value
value
,
int
frameSize
,
int
batchSize
,
activation_mode_t
active_node
,
activation_mode_t
active_gate
,
activation_mode_t
active_state
)
{
__global__
void
KeLstmForward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frameSize
,
int
batchSize
,
typename
hppl
::
ForwardActType
<
T
>::
type
active_node
,
typename
hppl
::
ForwardActType
<
T
>::
type
active_gate
,
typename
hppl
::
ForwardActType
<
T
>::
type
active_state
)
{
const
int
frameIdx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
frameIdx
>=
frameSize
)
return
;
...
...
@@ -67,8 +70,7 @@ __global__ void KeLstmForward(Op op, lstm_value value, int frameSize,
}
op
(
rValueIn
,
rValueIg
,
rValueFg
,
rValueOg
,
rPrevState
,
rState
,
rStateAtv
,
rOut
,
rCheckI
,
rCheckF
,
rCheckO
,
hppl
::
gpu
::
forward
[
active_node
],
hppl
::
gpu
::
forward
[
active_gate
],
hppl
::
gpu
::
forward
[
active_state
]);
rOut
,
rCheckI
,
rCheckF
,
rCheckO
,
active_node
,
active_gate
,
active_state
);
value
.
gateValue
[
frameIdx
]
=
rValueIn
;
value
.
gateValue
[
frameIdx
+
frameSize
]
=
rValueIg
;
...
...
@@ -85,11 +87,11 @@ __global__ void KeLstmForward(Op op, lstm_value value, int frameSize,
* grid(frameBlocks, batchBlocks)
*/
template
<
class
T
,
class
Op
,
bool
isBatch
>
__global__
void
KeLstmBackward
(
Op
op
,
lstm_value
value
,
lstm_grad
grad
,
int
frameSize
,
int
batch
Size
,
activation_mode_t
active_node
,
activation_mode_t
active_gate
,
activation_mode_t
active_state
)
{
__global__
void
KeLstmBackward
(
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame
Size
,
int
batchSize
,
typename
hppl
::
BackwardActType
<
T
>::
type
active_node
,
typename
hppl
::
BackwardActType
<
T
>::
type
active_gate
,
typename
hppl
::
BackwardActType
<
T
>::
type
active_state
)
{
const
int
frameIdx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
frameIdx
>=
frameSize
)
return
;
...
...
@@ -143,8 +145,7 @@ __global__ void KeLstmBackward(Op op, lstm_value value, lstm_grad grad,
op
(
rValueIn
,
rValueIg
,
rValueFg
,
rValueOg
,
rGradIn
,
rGradIg
,
rGradFg
,
rGradOg
,
rPrevState
,
rPrevStateGrad
,
rState
,
rStateGrad
,
rStateAtv
,
rOutputGrad
,
rCheckI
,
rCheckF
,
rCheckO
,
rCheckIGrad
,
rCheckFGrad
,
rCheckOGrad
,
hppl
::
gpu
::
backward
[
active_node
],
hppl
::
gpu
::
backward
[
active_gate
],
hppl
::
gpu
::
backward
[
active_state
]);
active_node
,
active_gate
,
active_state
);
grad
.
gateGrad
[
frameIdx
]
=
rGradIn
;
grad
.
gateGrad
[
frameIdx
+
frameSize
]
=
rGradIg
;
...
...
@@ -177,7 +178,8 @@ __global__ void KeLstmBackward(Op op, lstm_value value, lstm_grad grad,
}
template
<
class
T
,
class
Op
>
void
gpu_lstm_forward
(
Op
op
,
lstm_value
value
,
int
frameSize
,
int
batchSize
,
void
gpu_lstm_forward
(
const
platform
::
DeviceContext
&
context
,
Op
op
,
LstmMetaValue
<
T
>
value
,
int
frameSize
,
int
batchSize
,
activation_mode_t
active_node
,
activation_mode_t
active_gate
,
activation_mode_t
active_state
)
{
...
...
@@ -194,22 +196,30 @@ void gpu_lstm_forward(Op op, lstm_value value, int frameSize, int batchSize,
grid
=
dim3
((
frameSize
+
32
-
1
)
/
32
,
(
batchSize
+
32
-
1
)
/
32
);
}
using
type
=
typename
hppl
::
ForwardActType
<
T
>::
type
;
hppl
::
gpu
::
ForwardAct
<
T
>
act
;
type
act_node
=
act
(
active_node
);
type
act_gate
=
act
(
active_gate
);
type
act_state
=
act
(
active_state
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
context
).
stream
();
if
(
batchSize
==
1
)
{
KeLstmForward
<
T
,
Op
,
/* isBatch= */
false
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
op
,
value
,
frameSize
,
batchSize
,
active_node
,
active_gate
,
active_state
);
/* isBatch= */
false
><<<
grid
,
threads
,
0
,
stream
>>>
(
op
,
value
,
frameSize
,
batchSize
,
act_node
,
act_gate
,
act_state
);
}
else
{
KeLstmForward
<
T
,
Op
,
/* isBatch= */
true
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
op
,
value
,
frameSize
,
batchSize
,
active_node
,
active_gate
,
active_state
);
/* isBatch= */
true
><<<
grid
,
threads
,
0
,
stream
>>>
(
op
,
value
,
frameSize
,
batchSize
,
act_node
,
act_gate
,
act_state
);
}
}
template
<
class
T
,
class
Op
>
void
gpu_lstm_backward
(
Op
op
,
lstm_value
value
,
lstm_grad
grad
,
int
frameSize
,
int
batchSize
,
activation_mode_t
active_node
,
void
gpu_lstm_backward
(
const
platform
::
DeviceContext
&
context
,
Op
op
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frameSize
,
int
batchSize
,
activation_mode_t
active_node
,
activation_mode_t
active_gate
,
activation_mode_t
active_state
)
{
dim3
threads
;
...
...
@@ -225,16 +235,22 @@ void gpu_lstm_backward(Op op, lstm_value value, lstm_grad grad, int frameSize,
grid
=
dim3
((
frameSize
+
32
-
1
)
/
32
,
(
batchSize
+
32
-
1
)
/
32
);
}
using
type
=
typename
hppl
::
BackwardActType
<
T
>::
type
;
hppl
::
gpu
::
BackwardAct
<
T
>
act
;
type
act_node
=
act
(
active_node
);
type
act_gate
=
act
(
active_gate
);
type
act_state
=
act
(
active_state
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
context
).
stream
();
if
(
batchSize
==
1
)
{
KeLstmBackward
<
T
,
Op
,
/* isBatch= */
false
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
op
,
value
,
grad
,
frameSize
,
batchSize
,
active_node
,
active_gate
,
active_state
);
/* isBatch= */
false
><<<
grid
,
threads
,
0
,
stream
>>>
(
op
,
value
,
grad
,
frameSize
,
batchSize
,
act_node
,
act_gate
,
act_state
);
}
else
{
KeLstmBackward
<
T
,
Op
,
/* isBatch= */
true
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
op
,
value
,
grad
,
frameSize
,
batchSize
,
active_node
,
active_gate
,
active_state
);
/* isBatch= */
true
><<<
grid
,
threads
,
0
,
stream
>>>
(
op
,
value
,
grad
,
frameSize
,
batchSize
,
act_node
,
act_gate
,
act_state
);
}
}
...
...
paddle/operators/math/detail/lstm_kernel.h
浏览文件 @
2a8dbd13
...
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_activation_functions.h"
#include "
paddle/operators/math/detail/
hl_activation_functions.h"
#ifdef __CUDA_ARCH__
#define INLINE __device__ inline
...
...
@@ -33,9 +33,9 @@ class lstm {
INLINE
void
operator
()(
T
&
valueIn
,
T
&
valueIg
,
T
&
valueFg
,
T
&
valueOg
,
T
&
prevState
,
T
&
state
,
T
&
stateAtv
,
T
&
output
,
T
&
checkI
,
T
&
checkF
,
T
&
checkO
,
Active
<
T
>::
forward
actInput
,
Active
<
T
>::
forward
actGate
,
Active
<
T
>::
forward
actState
)
{
typename
hppl
::
ForwardActType
<
T
>::
type
actInput
,
typename
hppl
::
ForwardActType
<
T
>::
type
actGate
,
typename
hppl
::
ForwardActType
<
T
>::
type
actState
)
{
valueIn
=
actInput
(
valueIn
);
valueIg
=
actGate
(
valueIg
+
prevState
*
checkI
);
valueFg
=
actGate
(
valueFg
+
prevState
*
checkF
);
...
...
@@ -53,9 +53,9 @@ class lstm {
__m256
&
valueOg
,
__m256
&
prevState
,
__m256
&
state
,
__m256
&
stateAtv
,
__m256
&
output
,
__m256
&
checkI
,
__m256
&
checkF
,
__m256
&
checkO
,
Active
<
__m256
>::
forward
actInput
,
Active
<
__m256
>::
forward
actGate
,
Active
<
__m256
>::
forward
actState
)
{
hppl
::
Active
<
__m256
>::
forward
actInput
,
hppl
::
Active
<
__m256
>::
forward
actGate
,
hppl
::
Active
<
__m256
>::
forward
actState
)
{
valueIn
=
actInput
(
valueIn
);
valueIg
=
actGate
(
_mm256_add_ps
(
valueIg
,
_mm256_mul_ps
(
prevState
,
checkI
)));
valueFg
=
actGate
(
_mm256_add_ps
(
valueFg
,
_mm256_mul_ps
(
prevState
,
checkF
)));
...
...
@@ -81,9 +81,9 @@ class lstm {
T
&
prevState
,
T
&
prevStateGrad
,
T
&
state
,
T
&
stateGrad
,
T
&
stateAtv
,
T
&
outputGrad
,
T
&
checkI
,
T
&
checkF
,
T
&
checkO
,
T
&
checkIGrad
,
T
&
checkFGrad
,
T
&
checkOGrad
,
Active
<
T
>::
backward
actInput
,
Active
<
T
>::
backward
actGate
,
Active
<
T
>::
backward
actState
)
{
typename
hppl
::
BackwardActType
<
T
>::
type
actInput
,
typename
hppl
::
BackwardActType
<
T
>::
type
actGate
,
typename
hppl
::
BackwardActType
<
T
>::
type
actState
)
{
gradOg
=
actGate
(
outputGrad
*
stateAtv
,
valueOg
);
stateGrad
+=
actState
(
outputGrad
*
valueOg
,
stateAtv
)
+
gradOg
*
checkO
;
gradIn
=
actInput
(
stateGrad
*
valueIg
,
valueIn
);
...
...
@@ -106,9 +106,10 @@ class lstm {
__m256
&
stateGrad
,
__m256
&
stateAtv
,
__m256
&
outputGrad
,
__m256
&
checkI
,
__m256
&
checkF
,
__m256
&
checkO
,
__m256
&
checkIGrad
,
__m256
&
checkFGrad
,
__m256
&
checkOGrad
,
Active
<
__m256
>::
backward
actInput
,
Active
<
__m256
>::
backward
actGate
,
Active
<
__m256
>::
backward
actState
)
{
__m256
&
checkOGrad
,
hppl
::
Active
<
__m256
>::
backward
actInput
,
hppl
::
Active
<
__m256
>::
backward
actGate
,
hppl
::
Active
<
__m256
>::
backward
actState
)
{
gradOg
=
actGate
(
_mm256_mul_ps
(
outputGrad
,
stateAtv
),
valueOg
);
stateGrad
=
_mm256_add_ps
(
actState
(
_mm256_mul_ps
(
outputGrad
,
valueOg
),
stateAtv
),
stateGrad
);
...
...
@@ -134,5 +135,3 @@ class lstm {
}
// namespace math
}
// namespace operators
}
// namespace paddle
#endif
/* HL_LSTM_OPS_CUH_ */
paddle/operators/math/lstm_compute.cc
浏览文件 @
2a8dbd13
...
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "
LstmC
ompute.h"
#include "
paddle/operators/math/lstm_c
ompute.h"
#include "paddle/operators/math/detail/lstm_cpu_kernel.h"
#include "paddle/operators/math/detail/lstm_kernel.h"
...
...
@@ -22,19 +22,20 @@ namespace math {
template
<
class
T
>
struct
LstmUnitFunctor
<
platform
::
CPUPlace
,
T
>
{
static
void
compute
(
lstm_value
value
,
int
frame_size
,
int
batch_size
,
static
void
compute
(
const
platform
::
DeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
std
::
string
gate_act
,
std
::
string
cell_act
,
std
::
string
cand_act
)
{
for
(
int
b
=
0
;
b
<
batch_size
;
b
++
)
{
detail
::
cpu_lstm_forward
(
detail
::
forward
::
lstm
<
T
>
(),
value
,
frame
S
ize
,
detail
::
cpu_lstm_forward
(
detail
::
forward
::
lstm
<
T
>
(),
value
,
frame
_s
ize
,
ActiveType
(
cand_act
),
ActiveType
(
gate_act
),
ActiveType
(
cell_act
));
value
.
gateValue
+=
frame
S
ize
*
4
;
value
.
stateValue
+=
frame
S
ize
;
value
.
stateActiveValue
+=
frame
S
ize
;
value
.
outputValue
+=
frame
S
ize
;
value
.
gateValue
+=
frame
_s
ize
*
4
;
value
.
stateValue
+=
frame
_s
ize
;
value
.
stateActiveValue
+=
frame
_s
ize
;
value
.
outputValue
+=
frame
_s
ize
;
if
(
value
.
prevStateValue
)
{
value
.
prevStateValue
+=
frame
S
ize
;
value
.
prevStateValue
+=
frame
_s
ize
;
}
}
}
...
...
@@ -42,31 +43,36 @@ struct LstmUnitFunctor<platform::CPUPlace, T> {
template
<
class
T
>
struct
LstmUnitGradFunctor
<
platform
::
CPUPlace
,
T
>
{
static
void
compute
(
lstm_value
value
,
lstm_grad
grad
,
int
frame_size
,
int
batch_size
,
std
::
string
gate_act
,
static
void
compute
(
const
platform
::
DeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
int
batch_size
,
std
::
string
gate_act
,
std
::
string
cell_act
,
std
::
string
cand_act
)
{
for
(
int
b
=
0
;
b
<
batch
S
ize
;
b
++
)
{
for
(
int
b
=
0
;
b
<
batch
_s
ize
;
b
++
)
{
detail
::
cpu_lstm_backward
(
detail
::
backward
::
lstm
<
T
>
(),
value
,
grad
,
frame
S
ize
,
ActiveType
(
cand_act
),
frame
_s
ize
,
ActiveType
(
cand_act
),
ActiveType
(
gate_act
),
ActiveType
(
cell_act
));
value
.
gateValue
+=
frame
S
ize
*
4
;
value
.
stateValue
+=
frame
S
ize
;
value
.
stateActiveValue
+=
frame
S
ize
;
value
.
outputValue
+=
frame
S
ize
;
value
.
gateValue
+=
frame
_s
ize
*
4
;
value
.
stateValue
+=
frame
_s
ize
;
value
.
stateActiveValue
+=
frame
_s
ize
;
value
.
outputValue
+=
frame
_s
ize
;
if
(
value
.
prevStateValue
)
{
value
.
prevStateValue
+=
frame
S
ize
;
value
.
prevStateValue
+=
frame
_s
ize
;
}
grad
.
gateGrad
+=
frame
S
ize
*
4
;
grad
.
stateGrad
+=
frame
S
ize
;
grad
.
stateActiveGrad
+=
frame
S
ize
;
grad
.
outputGrad
+=
frame
S
ize
;
grad
.
gateGrad
+=
frame
_s
ize
*
4
;
grad
.
stateGrad
+=
frame
_s
ize
;
grad
.
stateActiveGrad
+=
frame
_s
ize
;
grad
.
outputGrad
+=
frame
_s
ize
;
if
(
grad
.
prevStateGrad
)
{
grad
.
prevStateGrad
+=
frame
S
ize
;
grad
.
prevStateGrad
+=
frame
_s
ize
;
}
}
};
}
};
template
class
LstmUnitFunctor
<
platform
::
CPUPlace
,
float
>;
template
class
LstmUnitGradFunctor
<
platform
::
CPUPlace
,
double
>;
}
// namespace math
}
// namespace operators
...
...
paddle/operators/math/lstm_compute.cu
浏览文件 @
2a8dbd13
...
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "LstmCompute.h"
#include "paddle/operators/math/detail/lstm_cpu_kernel.h"
#include "paddle/operators/math/detail/lstm_gpu_kernel.h"
#include "paddle/operators/math/detail/lstm_kernel.h"
#include "paddle/operators/math/lstm_compute.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -22,19 +22,20 @@ namespace math {
template
<
class
T
>
struct
LstmUnitFunctor
<
platform
::
GPUPlace
,
T
>
{
static
void
compute
(
lstm_value
value
,
int
frame_size
,
int
batch_size
,
static
void
compute
(
const
platform
::
DeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
std
::
string
gate_act
,
std
::
string
cell_act
,
std
::
string
cand_act
)
{
for
(
int
b
=
0
;
b
<
batch_size
;
b
++
)
{
detail
::
gpu_lstm_forward
(
detail
::
forward
::
lstm
<
T
>
(),
value
,
frameSiz
e
,
ActiveType
(
cand_act
),
ActiveType
(
gate
_act
),
ActiveType
(
cell_act
));
value
.
gateValue
+=
frame
S
ize
*
4
;
value
.
stateValue
+=
frame
S
ize
;
value
.
stateActiveValue
+=
frame
S
ize
;
value
.
outputValue
+=
frame
S
ize
;
detail
::
gpu_lstm_forward
(
context
,
detail
::
forward
::
lstm
<
T
>
(),
valu
e
,
frame_size
,
batch_size
,
ActiveType
(
cand
_act
),
ActiveType
(
gate_act
),
ActiveType
(
cell_act
));
value
.
gateValue
+=
frame
_s
ize
*
4
;
value
.
stateValue
+=
frame
_s
ize
;
value
.
stateActiveValue
+=
frame
_s
ize
;
value
.
outputValue
+=
frame
_s
ize
;
if
(
value
.
prevStateValue
)
{
value
.
prevStateValue
+=
frame
S
ize
;
value
.
prevStateValue
+=
frame
_s
ize
;
}
}
}
...
...
@@ -42,31 +43,37 @@ struct LstmUnitFunctor<platform::GPUPlace, T> {
template
<
class
T
>
struct
LstmUnitGradFunctor
<
platform
::
GPUPlace
,
T
>
{
static
void
compute
(
lstm_value
value
,
lstm_grad
grad
,
int
frame_size
,
int
batch_size
,
std
::
string
gate_act
,
static
void
compute
(
const
platform
::
DeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
int
batch_size
,
std
::
string
gate_act
,
std
::
string
cell_act
,
std
::
string
cand_act
)
{
for
(
int
b
=
0
;
b
<
batchSize
;
b
++
)
{
detail
::
gpu_lstm_backward
(
detail
::
backward
::
lstm
<
T
>
(),
value
,
grad
,
frameSize
,
ActiveType
(
cand_act
),
ActiveType
(
gate_act
),
ActiveType
(
cell_act
));
for
(
int
b
=
0
;
b
<
batch_size
;
b
++
)
{
detail
::
gpu_lstm_backward
(
context
,
detail
::
backward
::
lstm
<
T
>
(),
value
,
grad
,
frame_size
,
batch_size
,
ActiveType
(
cand_act
),
ActiveType
(
gate_act
),
ActiveType
(
cell_act
));
value
.
gateValue
+=
frame
S
ize
*
4
;
value
.
stateValue
+=
frame
S
ize
;
value
.
stateActiveValue
+=
frame
S
ize
;
value
.
outputValue
+=
frame
S
ize
;
value
.
gateValue
+=
frame
_s
ize
*
4
;
value
.
stateValue
+=
frame
_s
ize
;
value
.
stateActiveValue
+=
frame
_s
ize
;
value
.
outputValue
+=
frame
_s
ize
;
if
(
value
.
prevStateValue
)
{
value
.
prevStateValue
+=
frame
S
ize
;
value
.
prevStateValue
+=
frame
_s
ize
;
}
grad
.
gateGrad
+=
frame
S
ize
*
4
;
grad
.
stateGrad
+=
frame
S
ize
;
grad
.
stateActiveGrad
+=
frame
S
ize
;
grad
.
outputGrad
+=
frame
S
ize
;
grad
.
gateGrad
+=
frame
_s
ize
*
4
;
grad
.
stateGrad
+=
frame
_s
ize
;
grad
.
stateActiveGrad
+=
frame
_s
ize
;
grad
.
outputGrad
+=
frame
_s
ize
;
if
(
grad
.
prevStateGrad
)
{
grad
.
prevStateGrad
+=
frame
S
ize
;
grad
.
prevStateGrad
+=
frame
_s
ize
;
}
}
};
}
};
template
class
LstmUnitFunctor
<
platform
::
GPUPlace
,
float
>;
template
class
LstmUnitGradFunctor
<
platform
::
GPUPlace
,
double
>;
}
// namespace math
}
// namespace operators
...
...
paddle/operators/math/lstm_compute.h
浏览文件 @
2a8dbd13
...
...
@@ -14,7 +14,8 @@ limitations under the License. */
#pragma once
#include "paddle/platform/macros.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -28,28 +29,28 @@ typedef enum {
HL_ACTIVATION_END
}
activation_mode_t
;
template
<
T
>
struct
lstm_v
alue
{
real
*
gateValue
;
real
*
prevStateValue
;
real
*
stateValue
;
real
*
stateActiveValue
;
real
*
outputValue
;
real
*
checkIg
;
real
*
checkFg
;
real
*
checkOg
;
template
<
class
T
>
struct
LstmMetaV
alue
{
T
*
gateValue
;
T
*
prevStateValue
;
T
*
stateValue
;
T
*
stateActiveValue
;
T
*
outputValue
;
T
*
checkIg
;
T
*
checkFg
;
T
*
checkOg
;
};
template
<
T
>
struct
lstm_g
rad
{
real
*
gateGrad
;
real
*
prevStateGrad
;
real
*
stateGrad
;
real
*
stateActiveGrad
;
real
*
outputGrad
;
real
*
checkIgGrad
;
real
*
checkFgGrad
;
real
*
checkOgGrad
;
template
<
class
T
>
struct
LstmMetaG
rad
{
T
*
gateGrad
;
T
*
prevStateGrad
;
T
*
stateGrad
;
T
*
stateActiveGrad
;
T
*
outputGrad
;
T
*
checkIgGrad
;
T
*
checkFgGrad
;
T
*
checkOgGrad
;
};
activation_mode_t
ActiveType
(
const
std
::
string
&
type
)
{
...
...
@@ -69,7 +70,8 @@ activation_mode_t ActiveType(const std::string &type) {
template
<
typename
Place
,
typename
T
>
class
LstmUnitFunctor
{
public:
static
void
compute
(
lstm_value
value
,
int
frame_size
,
int
batch_size
,
static
void
compute
(
const
platform
::
DeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
std
::
string
gate_act
,
std
::
string
cell_act
,
std
::
string
cand_act
);
};
...
...
@@ -77,8 +79,9 @@ class LstmUnitFunctor {
template
<
typename
Place
,
typename
T
>
class
LstmUnitGradFunctor
{
public:
static
void
compute
(
lstm_value
value
,
lstm_grad
grad
,
int
frame_size
,
int
batch_size
,
std
::
string
gate_act
,
static
void
compute
(
const
platform
::
DeviceContext
&
context
,
LstmMetaValue
<
T
>
value
,
LstmMetaGrad
<
T
>
grad
,
int
frame_size
,
int
batch_size
,
std
::
string
gate_act
,
std
::
string
cell_act
,
std
::
string
cand_act
);
};
...
...
paddle/operators/math/sequence2batch.cc
浏览文件 @
2a8dbd13
...
...
@@ -22,12 +22,14 @@ template <typename T>
class
CopyMatrixRowsFunctor
<
platform
::
CPUPlace
,
T
>
{
public:
void
operator
()(
const
platform
::
DeviceContext
&
context
,
const
framework
::
Tensor
&
src
,
const
size_t
*
index
,
framework
::
Tensor
&
dst
,
bool
is_src_index
)
{
const
framework
::
LoD
Tensor
&
src
,
const
size_t
*
index
,
framework
::
LoD
Tensor
&
dst
,
bool
is_src_index
)
{
auto
src_dims
=
src
.
dims
();
auto
dst_dims
=
dst
.
dims
();
PADDLE_ENFORCE
(
src_dims
.
size
(),
2
,
"The src must be matrix with rank 2."
);
PADDLE_ENFORCE
(
dst_dims
.
size
(),
2
,
"The dst must be matrix with rank 2."
);
PADDLE_ENFORCE_EQ
(
src_dims
.
size
(),
2UL
,
"The src must be matrix with rank 2."
);
PADDLE_ENFORCE_EQ
(
dst_dims
.
size
(),
2UL
,
"The dst must be matrix with rank 2."
);
PADDLE_ENFORCE_EQ
(
src_dims
[
1
],
dst_dims
[
1
],
"The width of src and dst must be same."
);
auto
height
=
dst_dims
[
0
];
...
...
@@ -50,7 +52,9 @@ template class CopyMatrixRowsFunctor<platform::CPUPlace, float>;
template
class
CopyMatrixRowsFunctor
<
platform
::
CPUPlace
,
double
>;
template
class
LoDTensor2BatchFunctor
<
platform
::
CPUPlace
,
float
>;
template
class
Batch2LoDTensor2Functor
<
platform
::
CPUPlace
,
float
>;
template
class
LoDTensor2BatchFunctor
<
platform
::
CPUPlace
,
double
>;
template
class
Batch2LoDTensorFunctor
<
platform
::
CPUPlace
,
float
>;
template
class
Batch2LoDTensorFunctor
<
platform
::
CPUPlace
,
double
>;
}
// namespace math
}
// namespace operators
...
...
paddle/operators/math/sequence2batch.cu
浏览文件 @
2a8dbd13
...
...
@@ -19,8 +19,8 @@ namespace operators {
namespace
math
{
template
<
typename
T
,
int
BlockDimX
,
int
BlockDimY
,
int
GridDimX
>
__global__
void
CopyMatrixRowsKernel
(
const
T
*
src
,
T
*
dst
,
const
in
t
*
index
,
int
height
,
in
t
width
,
__global__
void
CopyMatrixRowsKernel
(
const
T
*
src
,
T
*
dst
,
const
size_
t
*
index
,
int
64_t
height
,
int64_
t
width
,
const
bool
is_src_index
)
{
int
idx
=
threadIdx
.
x
;
int
idy
=
threadIdx
.
y
;
...
...
@@ -28,7 +28,7 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const int* index,
while
(
id
<
height
)
{
int
src_idx
=
is_src_index
?
index
[
id
]
:
id
;
int
dst_idx
=
is_src_index
?
id
:
index
[
id
];
T
*
src_data
=
src
+
src_idx
*
width
;
const
T
*
src_data
=
src
+
src_idx
*
width
;
T
*
dst_data
=
dst
+
dst_idx
*
width
;
for
(
int
i
=
idx
;
i
<
width
;
i
+=
BlockDimX
)
{
dst_data
[
i
]
=
src_data
[
i
];
...
...
@@ -41,12 +41,14 @@ template <typename T>
class
CopyMatrixRowsFunctor
<
platform
::
GPUPlace
,
T
>
{
public:
void
operator
()(
const
platform
::
DeviceContext
&
context
,
const
framework
::
Tensor
&
src
,
const
size_t
*
index
,
framework
::
Tensor
&
dst
,
bool
is_src_index
)
{
const
framework
::
LoD
Tensor
&
src
,
const
size_t
*
index
,
framework
::
LoD
Tensor
&
dst
,
bool
is_src_index
)
{
auto
src_dims
=
src
.
dims
();
auto
dst_dims
=
dst
.
dims
();
PADDLE_ENFORCE
(
src_dims
.
size
(),
2
,
"The src must be matrix with rank 2."
);
PADDLE_ENFORCE
(
dst_dims
.
size
(),
2
,
"The dst must be matrix with rank 2."
);
PADDLE_ENFORCE_EQ
(
src_dims
.
size
(),
2
,
"The src must be matrix with rank 2."
);
PADDLE_ENFORCE_EQ
(
dst_dims
.
size
(),
2
,
"The dst must be matrix with rank 2."
);
PADDLE_ENFORCE_EQ
(
src_dims
[
1
],
dst_dims
[
1
],
"The width of src and dst must be same."
);
auto
height
=
dst_dims
[
0
];
...
...
@@ -56,9 +58,10 @@ class CopyMatrixRowsFunctor<platform::GPUPlace, T> {
dim3
threads
(
128
,
8
);
dim3
grid
(
8
,
1
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
context
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
context
).
stream
();
CopyMatrixRowsKernel
<
T
,
128
,
8
,
8
><<<
grid
,
threads
,
0
,
stream
>>>
(
src_data
,
dst_data
,
index
,
height
,
width
);
src_data
,
dst_data
,
index
,
height
,
width
,
is_src_index
);
}
};
...
...
@@ -66,7 +69,9 @@ template class CopyMatrixRowsFunctor<platform::GPUPlace, float>;
template
class
CopyMatrixRowsFunctor
<
platform
::
GPUPlace
,
double
>;
template
class
LoDTensor2BatchFunctor
<
platform
::
GPUPlace
,
float
>;
template
class
Batch2LoDTensor2Functor
<
platform
::
GPUPlace
,
float
>;
template
class
LoDTensor2BatchFunctor
<
platform
::
GPUPlace
,
double
>;
template
class
Batch2LoDTensorFunctor
<
platform
::
GPUPlace
,
float
>;
template
class
Batch2LoDTensorFunctor
<
platform
::
GPUPlace
,
double
>;
}
// namespace math
}
// namespace operators
...
...
paddle/operators/math/sequence2batch.h
浏览文件 @
2a8dbd13
...
...
@@ -12,6 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
...
...
@@ -25,8 +30,8 @@ class CopyMatrixRowsFunctor {
// copy the input src to the indexed rows of output dst.
// The indexed rows are based on the input index.
void
operator
()(
const
platform
::
DeviceContext
&
context
,
const
framework
::
Tensor
&
src
,
const
size_t
*
index
,
framework
::
Tensor
&
dst
,
const
bool
is_src_index
);
const
framework
::
LoD
Tensor
&
src
,
const
size_t
*
index
,
framework
::
LoD
Tensor
&
dst
,
const
bool
is_src_index
);
};
template
<
typename
Place
,
typename
T
>
...
...
@@ -35,8 +40,8 @@ class LoDTensor2BatchFunctor {
void
operator
()(
const
platform
::
DeviceContext
&
context
,
const
framework
::
LoDTensor
&
lod_tensor
,
framework
::
LoDTensor
&
batch
,
const
bool
is_reverse
)
const
{
auto
lods
=
lod_tensor
->
lod
();
PADDLE_ENFORCE_EQ
(
lod
.
size
(),
1UL
,
"Only support one level sequence now."
);
auto
lods
=
lod_tensor
.
lod
();
PADDLE_ENFORCE_EQ
(
lod
s
.
size
(),
1UL
,
"Only support one level sequence now."
);
auto
lod
=
lods
[
0
];
// Calculate the length of each sequence and
...
...
@@ -47,7 +52,7 @@ class LoDTensor2BatchFunctor {
//
struct
SeqInfo
{
SeqInfo
(
int
start
,
int
length
,
int
seq_idx
)
:
start
(
start
),
length
(
length
),
seq
I
dx
(
seq_idx
)
{}
:
start
(
start
),
length
(
length
),
seq
_i
dx
(
seq_idx
)
{}
int
start
;
int
length
;
int
seq_idx
;
...
...
@@ -78,19 +83,19 @@ class LoDTensor2BatchFunctor {
// The batch number represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence.
auto
batch_lods
=
batch
->
lod
();
if
(
!
batch_lods
)
{
batch_lods
->
resize
(
2
);
auto
batch_lods
=
batch
.
lod
();
if
(
batch_lods
.
size
()
==
0
)
{
batch_lods
.
resize
(
2
);
}
// batch_lods[0] is the start positions for batch LoDTensor
int
num_batch
=
(
size_t
)
seq_info
[
0
].
length
;
batch_lods
[
0
]
->
resize
(
num_batch
+
1
);
batch_lods
[
0
]
.
resize
(
num_batch
+
1
);
// batch_lods[1] is the raw index in the input LoDTensor
auto
dims
=
lod_tensor
->
dims
();
batch_lods
[
1
]
->
resize
(
dims
[
0
]);
auto
dims
=
lod_tensor
.
dims
();
batch_lods
[
1
]
.
resize
(
dims
[
0
]);
auto
*
batch_starts
=
batch_lods
[
0
].
data
();
auto
*
seq2batch_idx
=
batch_lods
[
1
].
data
();
size_t
*
batch_starts
=
batch_lods
[
0
].
data
();
size_t
*
seq2batch_idx
=
batch_lods
[
1
].
data
();
batch_starts
[
0
]
=
0
;
for
(
size_t
n
=
0
;
n
<
num_batch
;
n
++
)
{
int
batch_id
=
batch_starts
[
n
];
...
...
@@ -112,17 +117,27 @@ class LoDTensor2BatchFunctor {
}
CopyMatrixRowsFunctor
<
Place
,
T
>
to_batch
;
to_batch
(
context
,
lod_tensor
,
batch
,
true
);
to_batch
(
context
,
lod_tensor
,
seq2batch_idx
,
batch
,
true
);
}
};
template
<
typename
Place
,
typename
T
>
class
Batch2LoDTensor
2
Functor
{
class
Batch2LoDTensorFunctor
{
public:
void
operator
()(
const
platform
::
DeviceContext
&
context
,
const
framework
::
LoDTensor
&
batch
,
framework
::
LoDTensor
&
lod_tensor
,
const
bool
is_reverse
)
const
;
framework
::
LoDTensor
&
lod_tensor
)
const
{
auto
in_lod
=
batch
.
lod
();
PADDLE_ENFORCE_EQ
(
in_lod
.
size
(),
2UL
,
"The LoD size of input `batch` should be 2."
);
auto
out_lod
=
lod_tensor
.
lod
();
PADDLE_ENFORCE_EQ
(
out_lod
[
0
][
0
],
out_lod
[
1
].
size
());
PADDLE_ENFORCE_EQ
(
out_lod
[
0
][
0
],
lod_tensor
.
dims
()[
0
]);
PADDLE_ENFORCE_EQ
(
out_lod
[
0
][
0
],
batch
.
dims
()[
0
]);
CopyMatrixRowsFunctor
<
Place
,
T
>
to_seq
;
size_t
*
index
=
out_lod
[
1
].
data
();
to_seq
(
context
,
batch
,
index
,
lod_tensor
,
false
);
}
};
}
// namespace math
...
...
python/paddle/v2/framework/tests/test_lstm_op.py
0 → 100644
浏览文件 @
2a8dbd13
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
def
identity
(
x
):
return
x
def
sigmoid
(
x
):
return
1.
/
(
1.
+
np
.
exp
(
-
x
))
def
tanh
(
x
):
return
2.
*
sigmoid
(
2.
*
x
)
-
1.
def
relu
(
x
):
return
np
.
maximum
(
x
,
0
)
def
lstm
(
input
,
# T x 4D
lod
,
# 1 x N
h0
=
None
,
# N x D
c0
=
None
,
# N x D
w_h
=
None
,
# D x 4D
w_b
=
None
,
# 1 x 4D
w_c
=
None
,
# 1 x 3D
is_reverse
=
False
,
gate_act
=
None
,
cell_act
=
None
,
cand_act
=
None
):
def
_step
(
x
,
w_h
,
w_c
,
h_pre
,
c_pre
,
gate_act
,
cell_act
,
cand_act
):
g
=
np
.
dot
(
h_pre
,
w_h
)
# 1 x 4D
g
=
g
+
x
g
=
np
.
reshape
(
g
,
(
1
,
g
.
size
))
c
,
g_i
,
g_f
,
g_o
=
np
.
split
(
g
,
4
,
axis
=
1
)
if
w_c
is
None
:
g_i
=
gate_act
(
g_i
)
# 1 x D
g_f
=
gate_act
(
g_f
)
# 1 x D
else
:
w_ic
,
w_fc
,
w_oc
=
np
.
split
(
w_c
,
3
,
axis
=
1
)
g_i
=
gate_act
(
g_i
+
w_ic
*
c_pre
)
# 1 x D
g_f
=
gate_act
(
g_f
+
w_fc
*
c_pre
)
# 1 x D
c
=
g_f
*
c_pre
+
g_i
*
cand_act
(
c
)
# 1 x D
if
w_c
is
None
:
g_o
=
gate_act
(
g_o
)
# 1 x D
else
:
_
,
_
,
w_oc
=
np
.
split
(
w_c
,
3
,
axis
=
1
)
g_o
=
gate_act
(
g_o
+
w_oc
*
c
)
# 1 x D
h
=
g_o
*
cell_act
(
c
)
return
h
,
c
offset
=
lod
[
0
]
batch_size
=
len
(
offset
)
-
1
hidden
=
[]
cell
=
[]
if
w_b
is
not
None
:
input
=
input
+
np
.
tile
(
w_b
,
(
offset
[
-
1
],
1
))
for
i
in
range
(
batch_size
):
# compute one sequence
seq_len
=
offset
[
i
+
1
]
-
offset
[
i
]
x
=
input
[
offset
[
i
]:
offset
[
i
+
1
],
:]
h_pre
=
h0
[
i
]
# 1 x D
c_pre
=
h0
[
i
]
# 1 x D
for
j
in
range
(
seq_len
):
# compute one step
h_pre
,
c_pre
=
_step
(
x
[
j
],
w_h
,
w_c
,
h_pre
,
c_pre
,
gate_act
,
cell_act
,
cand_act
)
hidden
.
append
(
h_pre
.
flatten
())
cell
.
append
(
c_pre
.
flatten
())
hidden
=
np
.
array
(
hidden
).
astype
(
"float64"
)
cell
=
np
.
array
(
cell
).
astype
(
"float64"
)
assert
hidden
.
shape
==
(
input
.
shape
[
0
],
input
.
shape
[
1
]
/
4
)
assert
cell
.
shape
==
(
input
.
shape
[
0
],
input
.
shape
[
1
]
/
4
)
return
hidden
,
cell
class
LstmUnitTest
(
OpTest
):
def
set_data
(
self
):
lod
=
[[
0
,
2
,
6
,
9
]]
shape
=
(
9
,
64
)
x
=
np
.
random
.
normal
(
size
=
(
9
,
4
*
64
)).
astype
(
"float64"
)
h0
=
np
.
random
.
normal
(
size
=
(
4
,
64
)).
astype
(
"float64"
)
c0
=
np
.
random
.
normal
(
size
=
(
4
,
64
)).
astype
(
"float64"
)
w
=
np
.
random
.
normal
(
size
=
(
64
,
4
*
64
)).
astype
(
"float64"
)
b
=
np
.
random
.
normal
(
size
=
(
1
,
7
*
64
)).
astype
(
"float64"
)
w_b
=
b
[:,
4
*
64
]
w_c
=
b
[:,
4
*
64
:]
h
,
c
=
lstm
(
x
,
lod
,
h0
,
c0
,
w
,
w_b
,
w_c
,
False
,
sigmoid
,
tanh
,
tanh
)
self
.
inputs
=
{
'Input'
:
x
,
'H0'
:
h0
,
'C0'
:
c0
,
'Weight'
:
w
,
'Bias'
:
b
}
self
.
inputs
=
{
'Hidden'
:
h
,
'Cell'
:
c
}
self
.
attrs
=
{
'usePeepholes'
:
True
,
'isReverse'
:
False
,
'gateActivation'
:
'sigmoid'
,
'cellActivation'
:
'tanh'
,
'candidateActivation'
:
'tanh'
}
def
setUp
(
self
):
self
.
set_data
()
self
.
op_type
=
"lstm"
def
test_check_output
(
self
):
self
.
check_output
()
if
__name__
==
"__main__"
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录