Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
4ff16eb2
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
4ff16eb2
编写于
9月 07, 2020
作者:
G
GaoWei8
提交者:
GitHub
9月 07, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add padding cudnn interface (#26370)
* add lstm cudnn of padding data and refine cudnn codes
上级
35f53ecd
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
816 addition
and
195 deletion
+816
-195
paddle/fluid/operators/cudnn_lstm_op.cc
paddle/fluid/operators/cudnn_lstm_op.cc
+28
-20
paddle/fluid/operators/cudnn_lstm_op.cu.cc
paddle/fluid/operators/cudnn_lstm_op.cu.cc
+141
-64
paddle/fluid/platform/cudnn_helper.h
paddle/fluid/platform/cudnn_helper.h
+266
-0
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+8
-0
python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+373
-111
未找到文件。
paddle/fluid/operators/cudnn_lstm_op.cc
浏览文件 @
4ff16eb2
...
...
@@ -37,41 +37,42 @@ class CudnnLSTMOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"LastC"
),
"Output"
,
"LastC"
,
"CudnnLSTM"
);
auto
in_dims
=
ctx
->
GetInputDim
(
"Input"
);
auto
init_dims
=
ctx
->
GetInputDim
(
"InitH"
);
auto
init_h_dims
=
ctx
->
GetInputDim
(
"InitH"
);
auto
init_c_dims
=
ctx
->
GetInputDim
(
"InitC"
);
PADDLE_ENFORCE_EQ
(
in_dims
.
size
(),
3
,
platform
::
errors
::
InvalidArgument
(
"The rank of Input in CudnnLSTM must be 3. But "
"received Input's rank is %d."
,
in_dims
.
size
()));
PADDLE_ENFORCE_EQ
(
init_dims
.
size
(),
3
,
PADDLE_ENFORCE_EQ
(
init_
h_
dims
.
size
(),
3
,
platform
::
errors
::
InvalidArgument
(
"The rank of InitH in CudnnLSTM must be 3. But "
"received InitH's rank is %d."
,
init_dims
.
size
()));
init_
h_
dims
.
size
()));
PADDLE_ENFORCE_EQ
(
in_dims
[
1
],
init_dims
[
1
],
platform
::
errors
::
InvalidArgument
(
"The in_dims[1] (Input dims) and init_dims[1] (InitH "
"dims) should be equal. But "
"received in_dims[1] is %d and init_dims[1] is %d."
,
in_dims
[
1
],
init_dims
[
1
]));
PADDLE_ENFORCE_EQ
(
in_dims
[
2
],
init_dims
[
2
],
PADDLE_ENFORCE_EQ
(
in_dims
[
1
],
init_h_dims
[
1
],
platform
::
errors
::
InvalidArgument
(
"The in_dims[1] (Input dims) and init_h_dims[1] (InitH "
"dims) should be equal. But "
"received in_dims[1] is %d and init_h_dims[1] is %d."
,
in_dims
[
1
],
init_h_dims
[
1
]));
PADDLE_ENFORCE_EQ
(
init_c_dims
,
init_h_dims
,
platform
::
errors
::
InvalidArgument
(
"The
in_dims[2] (Input dims) and init_dims[2] (
InitH "
"dims
)
should be equal. But "
"received in
_dims[2] is %d and init_dims[2]
is %d."
,
in
_dims
[
2
],
init_dims
[
2
]
));
"The
InitC dims and
InitH "
"dims should be equal. But "
"received in
it_c_dims is %d and init_h_dims
is %d."
,
in
it_c_dims
,
init_h_dims
));
auto
out_dims
=
in_dims
;
auto
hidden_size
=
ctx
->
Attrs
().
Get
<
int
>
(
"hidden_size"
);
bool
is_bidirec
=
ctx
->
Attrs
().
Get
<
bool
>
(
"is_bidirec"
);
out_dims
[
2
]
=
is_bidirec
?
hidden_size
*
2
:
hidden_size
;
auto
last_dims
=
init_dims
;
last_dims
[
0
]
=
is_bidirec
?
last_dims
[
0
]
*
2
:
last_dims
[
0
];
ctx
->
SetOutputDim
(
"Out"
,
out_dims
);
ctx
->
SetOutputDim
(
"LastH"
,
last
_dims
);
ctx
->
SetOutputDim
(
"LastC"
,
last
_dims
);
ctx
->
SetOutputDim
(
"LastH"
,
init_c
_dims
);
ctx
->
SetOutputDim
(
"LastC"
,
init_h
_dims
);
}
protected:
...
...
@@ -95,7 +96,7 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
"different batch)"
"batch_size is the instance number of this batch"
"input_size is the hidden size of the input."
"input_
hidden_
size and the hidden_size in the next may not be same"
);
"input_size and the hidden_size in the next may not be same"
);
AddInput
(
"InitH"
,
"(Tensor) the initial hidden state of the LSTM"
"input. This is a tensor with shape (num_layers x batch_size x "
...
...
@@ -154,6 +155,13 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
.
SetDefault
(
1
);
AddAttr
<
bool
>
(
"is_test"
,
"True if in test phase."
).
SetDefault
(
false
);
AddAttr
<
int
>
(
"seed"
,
"seed to used if fix_seed is True"
).
SetDefault
(
0
);
AddAttr
<
std
::
vector
<
int
>>
(
"sequence_length"
,
"(vector<int>) When the input data is padding, "
"set this parameter. This parameter represents "
"the variable sequence"
"lengths in a batch. The size of the vector has "
"to equal the batch_size."
)
.
SetDefault
({});
AddComment
(
R"DOC(
CUDNN LSTM implementation
...
...
paddle/fluid/operators/cudnn_lstm_op.cu.cc
浏览文件 @
4ff16eb2
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -55,50 +56,96 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
int
num_layers
=
ctx
.
Attr
<
int
>
(
"num_layers"
);
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
int
seed
=
ctx
.
Attr
<
int
>
(
"seed"
);
auto
sequence_length
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"sequence_length"
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
CudnnRNNCache
*
cudnn_rnn_cache
=
new
CudnnRNNCache
();
int
seq_length
=
x
->
dims
()[
0
];
int
batch_size
=
x
->
dims
()[
1
];
int
input_size
=
x
->
dims
()[
2
];
int
weight_numel
=
w
->
numel
();
bool
state_initialized
=
state_out
->
IsInitialized
()
?
true
:
false
;
auto
input_w_numel
=
w
->
numel
();
auto
seq_len
=
x
->
dims
()[
0
];
auto
batch_size
=
x
->
dims
()[
1
];
auto
input_dim
=
x
->
dims
()[
2
];
size_t
workspace_size
;
size_t
reserve_size
;
bool
state_initialized
=
state_out
->
IsInitialized
()
?
true
:
false
;
cudnnDataType_t
cudnn_type
=
platform
::
ToCudnnDataType
(
framework
::
ToDataType
(
std
::
type_index
(
typeid
(
T
))));
cudnn_rnn_cache
->
init
(
handle
,
ctx
.
GetPlace
(),
seq_len
,
batch_size
,
input_dim
,
hidden_size
,
num_layers
,
dropout_prob
,
is_bidirec
,
seed
,
input_w_numel
,
&
reserve_size
,
state_out
,
state_initialized
,
cudnn_type
);
platform
::
ScopedRNNBase
rnn
(
seq_length
,
batch_size
,
input_size
,
hidden_size
,
num_layers
,
dropout_prob
,
seed
,
weight_numel
,
state_initialized
,
is_bidirec
);
rnn
.
Create
<
T
>
(
handle
,
ctx
.
GetPlace
(),
sequence_length
,
&
workspace_size
,
&
reserve_size
,
state_out
);
framework
::
Tensor
workspace_data_
;
workspace_data_
.
Resize
({
static_cast
<
int64_t
>
(
workspace_size
)});
workspace_data_
.
mutable_data
<
uint8_t
>
(
ctx
.
GetPlace
());
auto
*
reserve_data
=
reserve
->
mutable_data
<
uint8_t
>
(
{
static_cast
<
int64_t
>
(
reserve_size
)},
ctx
.
GetPlace
());
if
(
is_test
)
{
// for inference
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNForwardInference
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
seq_len
,
cudnn_rnn_cache
->
x_desc_
,
x_data
,
cudnn_rnn_cache
->
hx_desc_
,
init_h_data
,
cudnn_rnn_cache
->
cx_desc_
,
init_c_data
,
cudnn_rnn_cache
->
w_desc_
,
w_data
,
cudnn_rnn_cache
->
y_desc_
,
out_data
,
cudnn_rnn_cache
->
hy_desc_
,
last_h_data
,
cudnn_rnn_cache
->
cy_desc_
,
last_c_data
,
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
workspace_size_
));
if
(
sequence_length
.
empty
())
{
// for inference
// This interface is used when the input/output is unpadded.
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNForwardInference
(
handle
,
rnn
.
rnn_desc
(),
seq_length
,
rnn
.
x_desc
(),
x_data
,
rnn
.
hx_desc
(),
init_h_data
,
rnn
.
cx_desc
(),
init_c_data
,
rnn
.
w_desc
(),
w_data
,
rnn
.
y_desc
(),
out_data
,
rnn
.
hy_desc
(),
last_h_data
,
rnn
.
cy_desc
(),
last_c_data
,
workspace_data_
.
data
<
uint8_t
>
(),
workspace_size
));
}
else
{
#if CUDNN_VERSION >= 7201
// for inference
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNForwardInferenceEx
(
handle
,
rnn
.
rnn_desc
(),
rnn
.
x_seq_desc
(),
x_data
,
rnn
.
hx_desc
(),
init_h_data
,
rnn
.
cx_desc
(),
init_c_data
,
rnn
.
w_desc
(),
w_data
,
rnn
.
y_seq_desc
(),
out_data
,
rnn
.
hy_desc
(),
last_h_data
,
rnn
.
cy_desc
(),
last_c_data
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
workspace_data_
.
data
<
uint8_t
>
(),
workspace_size
));
#else
PADDLE_ENFORCE_NOT_NULL
(
nullptr
,
platform
::
errors
::
Unavailable
(
"The padded input is supported by "
"cudnnRNNForwardInferenceEx, but it only works when "
"the version of cudnn is larger than 7.2.1"
));
#endif
}
}
else
{
// for train
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNForwardTraining
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
seq_len
,
cudnn_rnn_cache
->
x_desc_
,
x_data
,
cudnn_rnn_cache
->
hx_desc_
,
init_h_data
,
cudnn_rnn_cache
->
cx_desc_
,
init_c_data
,
cudnn_rnn_cache
->
w_desc_
,
w_data
,
cudnn_rnn_cache
->
y_desc_
,
out_data
,
cudnn_rnn_cache
->
hy_desc_
,
last_h_data
,
cudnn_rnn_cache
->
cy_desc_
,
last_c_data
,
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
workspace_size_
,
reserve_data
,
reserve_size
));
if
(
sequence_length
.
empty
())
{
// for train
// This interface is used when the input/output is unpadded.
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNForwardTraining
(
handle
,
rnn
.
rnn_desc
(),
seq_length
,
rnn
.
x_desc
(),
x_data
,
rnn
.
hx_desc
(),
init_h_data
,
rnn
.
cx_desc
(),
init_c_data
,
rnn
.
w_desc
(),
w_data
,
rnn
.
y_desc
(),
out_data
,
rnn
.
hy_desc
(),
last_h_data
,
rnn
.
cy_desc
(),
last_c_data
,
workspace_data_
.
data
<
uint8_t
>
(),
workspace_size
,
reserve_data
,
reserve_size
));
}
else
{
#if CUDNN_VERSION >= 7201
// for train
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNForwardTrainingEx
(
handle
,
rnn
.
rnn_desc
(),
rnn
.
x_seq_desc
(),
x_data
,
rnn
.
hx_desc
(),
init_h_data
,
rnn
.
cx_desc
(),
init_c_data
,
rnn
.
w_desc
(),
w_data
,
rnn
.
y_seq_desc
(),
out_data
,
rnn
.
hy_desc
(),
last_h_data
,
rnn
.
cy_desc
(),
last_c_data
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
workspace_data_
.
data
<
uint8_t
>
(),
workspace_size
,
reserve_data
,
reserve_size
));
#else
PADDLE_ENFORCE_NOT_NULL
(
nullptr
,
platform
::
errors
::
Unavailable
(
"The padded input is supported by "
"cudnnRNNForwardTrainingEx, but it only works when "
"the version of cudnn is larger than 7.2.1"
));
#endif
}
}
delete
cudnn_rnn_cache
;
}
};
...
...
@@ -156,44 +203,74 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
int
hidden_size
=
ctx
.
Attr
<
int
>
(
"hidden_size"
);
int
num_layers
=
ctx
.
Attr
<
int
>
(
"num_layers"
);
int
seed
=
ctx
.
Attr
<
int
>
(
"seed"
);
auto
sequence_length
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"sequence_length"
);
CudnnRNNCache
*
cudnn_rnn_cache
=
new
CudnnRNNCache
();
int
seq_length
=
input_dims
[
0
];
int
batch_size
=
input
->
dims
()[
1
];
int
input_size
=
input
->
dims
()[
2
];
int
weight_numel
=
weight
->
numel
();
auto
input_w_numel
=
weight
->
numel
();
auto
seq_len
=
input_dims
[
0
];
auto
batch_size
=
input
->
dims
()[
1
];
auto
input_dim
=
input
->
dims
()[
2
];
size_t
workspace_size
;
size_t
reserve_size
;
cudnnDataType_t
cudnn_type
=
platform
::
ToCudnnDataType
(
framework
::
ToDataType
(
std
::
type_index
(
typeid
(
T
))));
cudnn_rnn_cache
->
init
(
handle
,
ctx
.
GetPlace
(),
seq_len
,
batch_size
,
input_dim
,
hidden_size
,
num_layers
,
dropout_prob
,
is_bidirec
,
seed
,
input_w_numel
,
&
reserve_size
,
const_cast
<
Tensor
*>
(
state_out
),
true
,
cudnn_type
);
auto
work_data
=
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
();
platform
::
ScopedRNNBase
rnn
(
seq_length
,
batch_size
,
input_size
,
hidden_size
,
num_layers
,
dropout_prob
,
seed
,
weight_numel
,
true
,
is_bidirec
);
rnn
.
Create
<
T
>
(
handle
,
ctx
.
GetPlace
(),
sequence_length
,
&
workspace_size
,
&
reserve_size
,
const_cast
<
Tensor
*>
(
state_out
));
framework
::
Tensor
workspace_data_
;
workspace_data_
.
Resize
({
static_cast
<
int64_t
>
(
workspace_size
)});
workspace_data_
.
mutable_data
<
uint8_t
>
(
ctx
.
GetPlace
());
const
uint8_t
*
reserve_data
=
reserve
->
data
<
uint8_t
>
();
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNBackwardData
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
seq_len
,
cudnn_rnn_cache
->
y_desc_
,
out_data
,
cudnn_rnn_cache
->
y_desc_
,
out_grad_data
,
cudnn_rnn_cache
->
hy_desc_
,
last_h_grad_data
,
cudnn_rnn_cache
->
cy_desc_
,
last_c_grad_data
,
cudnn_rnn_cache
->
w_desc_
,
weight_data
,
cudnn_rnn_cache
->
hx_desc_
,
init_h_data
,
cudnn_rnn_cache
->
cx_desc_
,
init_c_data
,
cudnn_rnn_cache
->
x_desc_
,
in_grad_data
,
cudnn_rnn_cache
->
hx_desc_
,
init_h_grad_data
,
cudnn_rnn_cache
->
cx_desc_
,
init_c_grad_data
,
work_data
,
cudnn_rnn_cache
->
workspace_size_
,
const_cast
<
uint8_t
*>
(
reserve_data
),
reserve_size
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNBackwardWeights
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
seq_len
,
cudnn_rnn_cache
->
x_desc_
,
input
->
data
<
T
>
(),
cudnn_rnn_cache
->
hx_desc_
,
init_h
->
data
<
T
>
(),
cudnn_rnn_cache
->
y_desc_
,
out
->
data
<
T
>
(),
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
workspace_size_
,
cudnn_rnn_cache
->
w_desc_
,
weight_grad
->
data
<
T
>
(),
const_cast
<
uint8_t
*>
(
reserve_data
),
reserve_size
));
delete
cudnn_rnn_cache
;
if
(
sequence_length
.
empty
())
{
// This interface is used when the input/output is unpadded.
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNBackwardData
(
handle
,
rnn
.
rnn_desc
(),
seq_length
,
rnn
.
y_desc
(),
out_data
,
rnn
.
y_desc
(),
out_grad_data
,
rnn
.
hy_desc
(),
last_h_grad_data
,
rnn
.
cy_desc
(),
last_c_grad_data
,
rnn
.
w_desc
(),
weight_data
,
rnn
.
hx_desc
(),
init_h_data
,
rnn
.
cx_desc
(),
init_c_data
,
rnn
.
x_desc
(),
in_grad_data
,
rnn
.
hx_desc
(),
init_h_grad_data
,
rnn
.
cx_desc
(),
init_c_grad_data
,
workspace_data_
.
data
<
uint8_t
>
(),
workspace_size
,
const_cast
<
uint8_t
*>
(
reserve_data
),
reserve_size
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNBackwardWeights
(
handle
,
rnn
.
rnn_desc
(),
seq_length
,
rnn
.
x_desc
(),
input
->
data
<
T
>
(),
rnn
.
hx_desc
(),
init_h
->
data
<
T
>
(),
rnn
.
y_desc
(),
out
->
data
<
T
>
(),
workspace_data_
.
data
<
uint8_t
>
(),
workspace_size
,
rnn
.
w_desc
(),
weight_grad
->
data
<
T
>
(),
const_cast
<
uint8_t
*>
(
reserve_data
),
reserve_size
));
}
else
{
#if CUDNN_VERSION >= 7201
// for train
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNBackwardDataEx
(
handle
,
rnn
.
rnn_desc
(),
rnn
.
y_seq_desc
(),
out_data
,
rnn
.
y_seq_desc
(),
out_grad_data
,
nullptr
,
nullptr
,
rnn
.
hy_desc
(),
last_h_grad_data
,
rnn
.
cy_desc
(),
last_c_grad_data
,
rnn
.
w_desc
(),
weight_data
,
rnn
.
hx_desc
(),
init_h_data
,
rnn
.
cx_desc
(),
init_c_data
,
rnn
.
x_seq_desc
(),
in_grad_data
,
rnn
.
hx_desc
(),
init_h_grad_data
,
rnn
.
cx_desc
(),
init_c_grad_data
,
nullptr
,
nullptr
,
workspace_data_
.
data
<
uint8_t
>
(),
workspace_size
,
const_cast
<
uint8_t
*>
(
reserve_data
),
reserve_size
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnRNNBackwardWeightsEx
(
handle
,
rnn
.
rnn_desc
(),
rnn
.
x_seq_desc
(),
input
->
data
<
T
>
(),
rnn
.
hx_desc
(),
init_h
->
data
<
T
>
(),
rnn
.
y_seq_desc
(),
out
->
data
<
T
>
(),
workspace_data_
.
data
<
uint8_t
>
(),
workspace_size
,
rnn
.
w_desc
(),
weight_grad
->
data
<
T
>
(),
const_cast
<
uint8_t
*>
(
reserve_data
),
reserve_size
));
#else
PADDLE_ENFORCE_NOT_NULL
(
nullptr
,
platform
::
errors
::
Unavailable
(
"The padded input of rnn is supported by cudnnRNNBackwardDataEx, "
"cudnnRNNBackwardWeightsEx, but it only works when the version "
"of cudnn is larger than 7.2.1"
));
#endif
}
}
};
...
...
paddle/fluid/platform/cudnn_helper.h
浏览文件 @
4ff16eb2
...
...
@@ -273,11 +273,116 @@ class ScopedTensorDescriptor {
groups
);
}
inline
cudnnTensorDescriptor_t
descriptor
(
const
cudnnDataType_t
cudnn_type
,
const
std
::
vector
<
int
>&
dim
,
const
std
::
vector
<
int
>&
stride
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnSetTensorNdDescriptor
(
desc_
,
cudnn_type
,
dim
.
size
(),
dim
.
data
(),
stride
.
data
()));
return
desc_
;
}
template
<
typename
T
>
inline
cudnnTensorDescriptor_t
descriptor
(
const
std
::
vector
<
int
>&
dim
,
const
std
::
vector
<
int
>&
stride
)
{
return
descriptor
(
CudnnDataType
<
T
>::
type
,
dim
,
stride
);
}
private:
cudnnTensorDescriptor_t
desc_
;
DISABLE_COPY_AND_ASSIGN
(
ScopedTensorDescriptor
);
};
class
ScopedRNNTensorDescriptor
{
public:
ScopedRNNTensorDescriptor
()
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnCreateRNNDataDescriptor
(
&
desc_
));
}
~
ScopedRNNTensorDescriptor
()
PADDLE_MAY_THROW
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyRNNDataDescriptor
(
desc_
));
}
inline
cudnnRNNDataDescriptor_t
descriptor
(
const
cudnnDataType_t
cudnn_type
,
int
max_seq_length
,
int
batch_size
,
int
input_size
,
bool
time_major
,
const
std
::
vector
<
int
>&
seq_length
)
{
static
float
padding_fill
=
0.0
f
;
cudnnRNNDataLayout_t
layout
;
if
(
time_major
)
{
layout
=
CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED
;
}
else
{
layout
=
CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED
;
}
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnSetRNNDataDescriptor
(
desc_
,
cudnn_type
,
layout
,
max_seq_length
,
batch_size
,
input_size
,
seq_length
.
data
(),
static_cast
<
void
*>
(
&
padding_fill
)));
return
desc_
;
}
template
<
typename
T
>
inline
cudnnRNNDataDescriptor_t
descriptor
(
int
max_length
,
int
batch_size
,
int
input_size
,
bool
time_major
,
const
std
::
vector
<
int
>&
seq_length
)
{
return
descriptor
(
CudnnDataType
<
T
>::
type
,
max_length
,
batch_size
,
input_size
,
time_major
,
seq_length
);
}
private:
cudnnRNNDataDescriptor_t
desc_
;
DISABLE_COPY_AND_ASSIGN
(
ScopedRNNTensorDescriptor
);
};
class
ScopedDropoutDescriptor
{
public:
ScopedDropoutDescriptor
()
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnCreateDropoutDescriptor
(
&
desc_
));
}
~
ScopedDropoutDescriptor
()
PADDLE_MAY_THROW
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyDropoutDescriptor
(
desc_
));
}
inline
cudnnDropoutDescriptor_t
descriptor
(
const
cudnnHandle_t
&
handle
,
const
platform
::
Place
&
place
,
bool
initialized
,
float
dropout_prob_
,
framework
::
Tensor
*
dropout_state_
,
int
seed
,
size_t
state_size
)
{
auto
*
dropout_state_data
=
dropout_state_
->
data
<
uint8_t
>
();
if
(
!
initialized
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnSetDropoutDescriptor
(
desc_
,
handle
,
dropout_prob_
,
dropout_state_data
,
state_size
,
seed
));
}
else
{
auto
dropout_state_dims
=
dropout_state_
->
dims
();
state_size
=
dropout_state_dims
[
0
];
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnRestoreDropoutDescriptor
(
desc_
,
handle
,
dropout_prob_
,
dropout_state_data
,
state_size
,
0
));
}
return
desc_
;
}
private:
cudnnDropoutDescriptor_t
desc_
;
DISABLE_COPY_AND_ASSIGN
(
ScopedDropoutDescriptor
);
};
class
ScopedRNNDescriptor
{
public:
ScopedRNNDescriptor
()
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnCreateRNNDescriptor
(
&
desc_
));
}
~
ScopedRNNDescriptor
()
PADDLE_MAY_THROW
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyRNNDescriptor
(
desc_
));
}
inline
cudnnRNNDescriptor_t
descriptor
()
{
return
desc_
;
}
private:
cudnnRNNDescriptor_t
desc_
;
DISABLE_COPY_AND_ASSIGN
(
ScopedRNNDescriptor
);
};
class
ScopedFilterDescriptor
{
public:
ScopedFilterDescriptor
()
{
...
...
@@ -319,6 +424,167 @@ class ScopedFilterDescriptor {
DISABLE_COPY_AND_ASSIGN
(
ScopedFilterDescriptor
);
};
class
ScopedRNNBase
{
public:
ScopedRNNBase
(
int
seq_length
,
int
batch_size
,
int
input_size
,
int
hidden_size
,
int
num_layers
,
float
dropout_prob
,
int
seed
,
int
weight_numel
,
bool
initialized
,
bool
is_bidirec
)
:
seq_length_
(
seq_length
),
batch_size_
(
batch_size
),
input_size_
(
input_size
),
hidden_size_
(
hidden_size
),
num_layers_
(
num_layers
),
dropout_prob_
(
dropout_prob
),
seed_
(
seed
),
weight_numel_
(
weight_numel
),
initialized_
(
initialized
),
is_bidirec_
(
is_bidirec
)
{}
template
<
typename
T
>
void
Create
(
const
cudnnHandle_t
&
handle
,
const
platform
::
Place
&
place
,
std
::
vector
<
int
>
sequence_length
,
size_t
*
workspace_size
,
size_t
*
reserve_size
,
framework
::
Tensor
*
dropout_state
)
{
int
numDirections
=
is_bidirec_
?
2
:
1
;
cudnnDataType_t
cudnn_type
=
platform
::
CudnnDataType
<
T
>::
type
;
// ------------------- cudnn x, y descriptors ---------------------
std
::
vector
<
int
>
dims_x
=
{
batch_size_
,
input_size_
,
1
};
std
::
vector
<
int
>
strides_x
=
{
input_size_
,
1
,
1
};
std
::
vector
<
int
>
dims_y
=
{
batch_size_
,
hidden_size_
*
numDirections
,
1
};
std
::
vector
<
int
>
strides_y
=
{
hidden_size_
*
numDirections
,
1
,
1
};
for
(
int
i
=
0
;
i
<
seq_length_
;
++
i
)
{
x_desc_
.
emplace_back
(
x_d
.
descriptor
<
T
>
(
dims_x
,
strides_x
));
y_desc_
.
emplace_back
(
y_d
.
descriptor
<
T
>
(
dims_y
,
strides_y
));
}
if
(
!
sequence_length
.
empty
())
{
x_seq_desc_
=
x_seq_d
.
descriptor
<
T
>
(
seq_length_
,
batch_size_
,
input_size_
,
true
,
sequence_length
);
y_seq_desc_
=
y_seq_d
.
descriptor
<
T
>
(
seq_length_
,
batch_size_
,
hidden_size_
*
numDirections
,
true
,
sequence_length
);
}
// ------------------- cudnn hx, hy, cx, cy descriptors----------
std
::
vector
<
int
>
dims_hx
=
{
num_layers_
*
numDirections
,
batch_size_
,
hidden_size_
};
std
::
vector
<
int
>
strides_hx
=
{
hidden_size_
*
batch_size_
,
hidden_size_
,
1
};
hx_desc_
=
hx_d
.
descriptor
<
T
>
(
dims_hx
,
strides_hx
);
cx_desc_
=
cx_d
.
descriptor
<
T
>
(
dims_hx
,
strides_hx
);
hy_desc_
=
hy_d
.
descriptor
<
T
>
(
dims_hx
,
strides_hx
);
cy_desc_
=
cy_d
.
descriptor
<
T
>
(
dims_hx
,
strides_hx
);
// ------------------- cudnn dropout descriptors ---------------------
size_t
state_size
;
if
(
!
initialized_
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDropoutGetStatesSize
(
handle
,
&
state_size
));
dropout_state
->
mutable_data
<
uint8_t
>
({
static_cast
<
int64_t
>
(
state_size
)},
place
);
}
dropout_desc_
=
dropout_d
.
descriptor
(
handle
,
place
,
initialized_
,
dropout_prob_
,
dropout_state
,
seed_
,
state_size
);
// ------------------- cudnn rnn descriptors ---------------------
rnn_desc_
=
rnn_d
.
descriptor
();
#if CUDNN_VERSION >= 6000
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnSetRNNDescriptor_v6
(
handle
,
rnn_desc_
,
hidden_size_
,
num_layers_
,
dropout_desc_
,
CUDNN_LINEAR_INPUT
,
is_bidirec_
?
CUDNN_BIDIRECTIONAL
:
CUDNN_UNIDIRECTIONAL
,
CUDNN_LSTM
,
CUDNN_RNN_ALGO_STANDARD
,
cudnn_type
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnSetRNNDescriptor
(
rnn_desc_
,
hidden_size_
,
num_layers_
,
dropout_desc_
,
CUDNN_LINEAR_INPUT
,
is_bidirec_
?
CUDNN_BIDIRECTIONAL
:
CUDNN_UNIDIRECTIONAL
,
CUDNN_LSTM
,
cudnn_type
));
#endif
if
(
!
sequence_length
.
empty
())
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnSetRNNPaddingMode
(
rnn_desc_
,
CUDNN_RNN_PADDED_IO_ENABLED
));
}
// ------------------- cudnn weights_size ---------------------
size_t
weights_size_
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetRNNParamsSize
(
handle
,
rnn_desc_
,
x_desc_
[
0
],
&
weights_size_
,
cudnn_type
));
PADDLE_ENFORCE_EQ
(
weights_size_
,
sizeof
(
T
)
*
weight_numel_
,
platform
::
errors
::
InvalidArgument
(
"The cudnn lstm and setting weight size should be same."
));
// ------------------- cudnn weight descriptors ---------------------
platform
::
DataLayout
layout
=
platform
::
DataLayout
::
kNCHW
;
int
dim_tmp
=
weights_size_
/
sizeof
(
T
);
std
::
vector
<
int
>
dim_w
=
{
dim_tmp
,
1
,
1
};
w_desc_
=
w_d
.
descriptor
<
T
>
(
layout
,
dim_w
);
// ------------------- cudnn workspace, reserve size ---------------------
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetRNNWorkspaceSize
(
handle
,
rnn_desc_
,
seq_length_
,
x_desc_
.
data
(),
workspace_size
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetRNNTrainingReserveSize
(
handle
,
rnn_desc_
,
seq_length_
,
x_desc_
.
data
(),
reserve_size
));
}
cudnnTensorDescriptor_t
*
x_desc
()
{
return
x_desc_
.
data
();
}
cudnnTensorDescriptor_t
*
y_desc
()
{
return
y_desc_
.
data
();
}
cudnnRNNDataDescriptor_t
x_seq_desc
()
{
return
x_seq_desc_
;
}
cudnnRNNDataDescriptor_t
y_seq_desc
()
{
return
y_seq_desc_
;
}
cudnnTensorDescriptor_t
hx_desc
()
{
return
hx_desc_
;
}
cudnnTensorDescriptor_t
cx_desc
()
{
return
cx_desc_
;
}
cudnnTensorDescriptor_t
hy_desc
()
{
return
hy_desc_
;
}
cudnnTensorDescriptor_t
cy_desc
()
{
return
cy_desc_
;
}
cudnnRNNDescriptor_t
rnn_desc
()
{
return
rnn_desc_
;
}
cudnnDropoutDescriptor_t
dropout_desc
()
{
return
dropout_desc_
;
}
cudnnFilterDescriptor_t
w_desc
()
{
return
w_desc_
;
}
private:
int
seq_length_
;
int
batch_size_
;
int
input_size_
;
int
hidden_size_
;
int
num_layers_
;
float
dropout_prob_
;
int
seed_
;
int
weight_numel_
;
bool
initialized_
;
bool
is_bidirec_
;
std
::
vector
<
cudnnTensorDescriptor_t
>
x_desc_
;
std
::
vector
<
cudnnTensorDescriptor_t
>
y_desc_
;
cudnnRNNDataDescriptor_t
x_seq_desc_
;
cudnnRNNDataDescriptor_t
y_seq_desc_
;
// A tensor descriptor describing the initial hidden state of the RNN.
cudnnTensorDescriptor_t
hx_desc_
;
// A tensor descriptor describing the initial cell state for LSTM networks.
cudnnTensorDescriptor_t
cx_desc_
;
// A tensor descriptor describing the final hidden state of the RNN.
cudnnTensorDescriptor_t
hy_desc_
;
// A tensor descriptor describing the final cell state for LSTM networks.
cudnnTensorDescriptor_t
cy_desc_
;
cudnnDropoutDescriptor_t
dropout_desc_
;
cudnnFilterDescriptor_t
w_desc_
;
cudnnRNNDescriptor_t
rnn_desc_
;
ScopedTensorDescriptor
x_d
;
ScopedTensorDescriptor
y_d
;
ScopedRNNTensorDescriptor
x_seq_d
;
ScopedRNNTensorDescriptor
y_seq_d
;
ScopedTensorDescriptor
hx_d
;
ScopedTensorDescriptor
cx_d
;
ScopedTensorDescriptor
hy_d
;
ScopedTensorDescriptor
cy_d
;
ScopedDropoutDescriptor
dropout_d
;
ScopedFilterDescriptor
w_d
;
ScopedRNNDescriptor
rnn_d
;
};
class
ScopedConvolutionDescriptor
{
public:
ScopedConvolutionDescriptor
()
{
...
...
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
4ff16eb2
...
...
@@ -101,6 +101,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnRestoreDropoutDescriptor); \
__macro(cudnnCreateRNNDataDescriptor); \
__macro(cudnnDestroyRNNDataDescriptor); \
__macro(cudnnSetRNNDataDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \
...
...
@@ -109,6 +112,11 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnRNNBackwardData); \
__macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \
__macro(cudnnRNNForwardTrainingEx); \
__macro(cudnnSetRNNPaddingMode); \
__macro(cudnnRNNBackwardDataEx); \
__macro(cudnnRNNBackwardWeightsEx); \
__macro(cudnnRNNForwardInferenceEx); \
__macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor); \
__macro(cudnnSetTensorNdDescriptorEx);
...
...
python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
浏览文件 @
4ff16eb2
...
...
@@ -16,6 +16,7 @@ from __future__ import print_function
import
unittest
import
numpy
as
np
import
math
import
paddle.fluid.core
as
core
from
op_test
import
OpTest
...
...
@@ -27,120 +28,372 @@ SIGMOID_THRESHOLD_MAX = 13.0
EXP_MAX_INPUT
=
40.0
def
lstm_naive
(
input
,
w
):
seq_len
,
batch_size
,
hidden_size
=
input
.
shape
offset
=
0
wi
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
wf
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
wc
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
wo
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
ri
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
rf
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
rc
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
ro
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
bi_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bf_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bc_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bo_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bi_2
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bf_2
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bc_2
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bo_2
=
w
[
offset
:
offset
+
hidden_size
]
def
sigmoid
(
x
):
y
=
np
.
copy
(
x
)
y
[
x
<
SIGMOID_THRESHOLD_MIN
]
=
SIGMOID_THRESHOLD_MIN
y
[
x
>
SIGMOID_THRESHOLD_MAX
]
=
SIGMOID_THRESHOLD_MAX
return
1.
/
(
1.
+
np
.
exp
(
-
y
))
def
tanh
(
x
):
y
=
-
2.
*
x
y
[
y
>
EXP_MAX_INPUT
]
=
EXP_MAX_INPUT
return
(
2.
/
(
1.
+
np
.
exp
(
y
)))
-
1.
output
=
[]
pre_h
=
np
.
zeros
((
1
,
batch_size
,
hidden_size
),
dtype
=
input
.
dtype
)
pre_c
=
np
.
zeros
((
1
,
batch_size
,
hidden_size
),
dtype
=
input
.
dtype
)
for
i
in
range
(
seq_len
):
emb_1
=
input
[
i
]
input_gate
=
sigmoid
(
np
.
matmul
(
emb_1
,
wi
)
+
np
.
matmul
(
pre_h
,
ri
)
+
bi_1
+
bi_2
)
forget_gate
=
sigmoid
(
np
.
matmul
(
emb_1
,
wf
)
+
np
.
matmul
(
pre_h
,
rf
)
+
bf_1
+
bf_2
)
output_gate
=
sigmoid
(
np
.
matmul
(
emb_1
,
wo
)
+
np
.
matmul
(
pre_h
,
ro
)
+
bo_1
+
bo_2
)
c_t_temp
=
tanh
(
np
.
matmul
(
emb_1
,
wc
)
+
np
.
matmul
(
pre_h
,
rc
)
+
bc_1
+
bc_2
)
new_c
=
input_gate
*
c_t_temp
+
forget_gate
*
pre_c
new_h
=
output_gate
*
tanh
(
new_c
)
pre_h
=
new_h
pre_c
=
new_c
output
.
append
(
new_h
)
output
=
np
.
concatenate
(
output
,
-
1
)
output
=
output
.
reshape
((
batch_size
,
-
1
,
hidden_size
))
output
=
output
.
transpose
((
1
,
0
,
2
))
return
output
,
pre_h
,
pre_c
class
LayerMixin
(
object
):
def
__call__
(
self
,
*
args
,
**
kwargs
):
return
self
.
forward
(
*
args
,
**
kwargs
)
class
LayerListMixin
(
LayerMixin
):
def
__init__
(
self
,
layers
=
None
):
self
.
_layers
=
list
(
layers
)
if
layers
else
[]
def
append
(
self
,
layer
):
self
.
_layers
.
append
(
layer
)
def
__iter__
(
self
):
return
iter
(
self
.
_layers
)
class
LSTMCell
(
LayerMixin
):
def
__init__
(
self
,
input_size
,
hidden_size
,
bias
=
True
):
self
.
input_size
=
input_size
self
.
hidden_size
=
hidden_size
self
.
bias
=
bias
self
.
dtype
=
np
.
float64
self
.
parameters
=
dict
()
std
=
1.0
/
math
.
sqrt
(
hidden_size
)
self
.
weight_ih
=
np
.
ones
(
(
4
*
hidden_size
,
input_size
),
dtype
=
self
.
dtype
)
self
.
weight_hh
=
np
.
ones
((
4
*
hidden_size
,
hidden_size
)).
astype
(
self
.
dtype
)
self
.
parameters
[
'weight_ih'
]
=
self
.
weight_ih
self
.
parameters
[
'weight_hh'
]
=
self
.
weight_hh
if
bias
:
self
.
bias_ih
=
np
.
ones
((
4
*
hidden_size
)).
astype
(
self
.
dtype
)
self
.
bias_hh
=
np
.
ones
((
4
*
hidden_size
)).
astype
(
self
.
dtype
)
self
.
parameters
[
'bias_ih'
]
=
self
.
bias_ih
self
.
parameters
[
'bias_hh'
]
=
self
.
bias_hh
else
:
self
.
bias_ih
=
None
self
.
bias_hh
=
None
def
init_state
(
self
,
inputs
):
batch_size
=
inputs
.
shape
[
0
]
init_h
=
np
.
zeros
((
batch_size
,
self
.
hidden_size
),
dtype
=
inputs
.
dtype
)
init_c
=
np
.
zeros
((
batch_size
,
self
.
hidden_size
),
dtype
=
inputs
.
dtype
)
return
init_h
,
init_c
def
forward
(
self
,
inputs
,
hx
=
None
):
if
hx
is
None
:
hx
=
self
.
init_state
(
inputs
)
pre_hidden
,
pre_cell
=
hx
gates
=
np
.
matmul
(
inputs
,
self
.
weight_ih
.
T
)
if
self
.
bias_ih
is
not
None
:
gates
=
gates
+
self
.
bias_ih
gates
+=
np
.
matmul
(
pre_hidden
,
self
.
weight_hh
.
T
)
if
self
.
bias_hh
is
not
None
:
gates
=
gates
+
self
.
bias_hh
chunked_gates
=
np
.
split
(
gates
,
4
,
-
1
)
i
=
1.0
/
(
1.0
+
np
.
exp
(
-
chunked_gates
[
0
]))
f
=
1.0
/
(
1.0
+
np
.
exp
(
-
chunked_gates
[
1
]))
o
=
1.0
/
(
1.0
+
np
.
exp
(
-
chunked_gates
[
3
]))
c
=
f
*
pre_cell
+
i
*
np
.
tanh
(
chunked_gates
[
2
])
h
=
o
*
np
.
tanh
(
c
)
return
h
,
(
h
,
c
)
def
sequence_mask
(
lengths
,
max_len
=
None
):
if
max_len
is
None
:
max_len
=
np
.
max
(
lengths
)
else
:
assert
max_len
>=
np
.
max
(
lengths
)
return
np
.
arange
(
max_len
)
<
np
.
expand_dims
(
lengths
,
-
1
)
def
update_state
(
mask
,
new
,
old
):
if
not
isinstance
(
old
,
(
tuple
,
list
)):
return
np
.
where
(
mask
,
new
,
old
)
else
:
return
tuple
(
map
(
lambda
x
,
y
:
np
.
where
(
mask
,
x
,
y
),
new
,
old
))
def
rnn
(
cell
,
inputs
,
initial_states
,
sequence_length
=
None
,
time_major
=
False
,
is_reverse
=
False
):
if
not
time_major
:
inputs
=
np
.
transpose
(
inputs
,
[
1
,
0
,
2
])
if
is_reverse
:
inputs
=
np
.
flip
(
inputs
,
0
)
if
sequence_length
is
None
:
mask
=
None
else
:
mask
=
np
.
transpose
(
sequence_mask
(
sequence_length
),
[
1
,
0
])
mask
=
np
.
expand_dims
(
mask
,
-
1
)
if
is_reverse
:
mask
=
np
.
flip
(
mask
,
0
)
time_steps
=
inputs
.
shape
[
0
]
state
=
initial_states
outputs
=
[]
for
t
in
range
(
time_steps
):
x_t
=
inputs
[
t
]
if
mask
is
not
None
:
m_t
=
mask
[
t
]
y
,
new_state
=
cell
(
x_t
,
state
)
y
=
np
.
where
(
m_t
,
y
,
0.
)
outputs
.
append
(
y
)
state
=
update_state
(
m_t
,
new_state
,
state
)
else
:
y
,
new_state
=
cell
(
x_t
,
state
)
outputs
.
append
(
y
)
state
=
new_state
outputs
=
np
.
stack
(
outputs
)
final_state
=
state
if
is_reverse
:
outputs
=
np
.
flip
(
outputs
,
0
)
if
not
time_major
:
outputs
=
np
.
transpose
(
outputs
,
[
1
,
0
,
2
])
return
outputs
,
final_state
def
birnn
(
cell_fw
,
cell_bw
,
inputs
,
initial_states
,
sequence_length
=
None
,
time_major
=
False
):
states_fw
,
states_bw
=
initial_states
outputs_fw
,
states_fw
=
rnn
(
cell_fw
,
inputs
,
states_fw
,
sequence_length
,
time_major
=
time_major
)
outputs_bw
,
states_bw
=
rnn
(
cell_bw
,
inputs
,
states_bw
,
sequence_length
,
time_major
=
time_major
,
is_reverse
=
True
)
outputs
=
np
.
concatenate
((
outputs_fw
,
outputs_bw
),
-
1
)
final_states
=
(
states_fw
,
states_bw
)
return
outputs
,
final_states
def
flatten
(
nested
):
return
list
(
_flatten
(
nested
))
def
_flatten
(
nested
):
for
item
in
nested
:
if
isinstance
(
item
,
(
list
,
tuple
)):
for
subitem
in
_flatten
(
item
):
yield
subitem
else
:
yield
item
def
unstack
(
array
,
axis
=
0
):
num
=
array
.
shape
[
axis
]
sub_arrays
=
np
.
split
(
array
,
num
,
axis
)
return
[
np
.
squeeze
(
sub_array
,
axis
)
for
sub_array
in
sub_arrays
]
def
dropout
(
array
,
p
=
0.0
):
if
p
==
0.0
:
return
array
mask
=
(
np
.
random
.
uniform
(
size
=
array
.
shape
)
<
(
1
-
p
)).
astype
(
array
.
dtype
)
return
array
*
(
mask
/
(
1
-
p
))
def
split_states
(
states
,
bidirectional
=
False
,
state_components
=
1
):
if
state_components
==
1
:
states
=
unstack
(
states
)
if
not
bidirectional
:
return
states
else
:
return
list
(
zip
(
states
[::
2
],
states
[
1
::
2
]))
else
:
assert
len
(
states
)
==
state_components
states
=
tuple
([
unstack
(
item
)
for
item
in
states
])
if
not
bidirectional
:
return
list
(
zip
(
*
states
))
else
:
states
=
list
(
zip
(
*
states
))
return
list
(
zip
(
states
[::
2
],
states
[
1
::
2
]))
def
concat_states
(
states
,
bidirectional
=
False
,
state_components
=
1
):
if
state_components
==
1
:
return
np
.
stack
(
flatten
(
states
))
else
:
states
=
flatten
(
states
)
componnets
=
[]
for
i
in
range
(
state_components
):
componnets
.
append
(
states
[
i
::
state_components
])
return
[
np
.
stack
(
item
)
for
item
in
componnets
]
class
RNN
(
LayerMixin
):
def
__init__
(
self
,
cell
,
is_reverse
=
False
,
time_major
=
False
):
super
(
RNN
,
self
).
__init__
()
self
.
cell
=
cell
if
not
hasattr
(
self
.
cell
,
"call"
):
# for non-dygraph mode, `rnn` api uses cell.call
self
.
cell
.
call
=
self
.
cell
.
forward
self
.
is_reverse
=
is_reverse
self
.
time_major
=
time_major
def
forward
(
self
,
inputs
,
initial_states
=
None
,
sequence_length
=
None
):
final_outputs
,
final_states
=
rnn
(
self
.
cell
,
inputs
,
initial_states
=
initial_states
,
sequence_length
=
sequence_length
,
time_major
=
self
.
time_major
,
is_reverse
=
self
.
is_reverse
)
return
final_outputs
,
final_states
class
BiRNN
(
LayerMixin
):
def
__init__
(
self
,
cell_fw
,
cell_bw
,
time_major
=
False
):
super
(
BiRNN
,
self
).
__init__
()
self
.
cell_fw
=
cell_fw
self
.
cell_bw
=
cell_bw
self
.
time_major
=
time_major
def
forward
(
self
,
inputs
,
initial_states
=
None
,
sequence_length
=
None
,
**
kwargs
):
if
isinstance
(
initial_states
,
(
list
,
tuple
)):
assert
len
(
initial_states
)
==
2
,
\
"length of initial_states should be 2 when it is a list/tuple"
else
:
initial_states
=
[
initial_states
,
initial_states
]
outputs
,
final_states
=
birnn
(
self
.
cell_fw
,
self
.
cell_bw
,
inputs
,
initial_states
,
sequence_length
,
self
.
time_major
)
return
outputs
,
final_states
class
RNNMixin
(
LayerListMixin
):
def
forward
(
self
,
inputs
,
initial_states
=
None
,
sequence_length
=
None
):
batch_index
=
1
if
self
.
time_major
else
0
batch_size
=
inputs
.
shape
[
batch_index
]
dtype
=
inputs
.
dtype
if
initial_states
is
None
:
state_shape
=
(
self
.
num_layers
*
self
.
num_directions
,
batch_size
,
self
.
hidden_size
)
if
self
.
state_components
==
1
:
initial_states
=
np
.
zeros
(
state_shape
,
dtype
)
else
:
initial_states
=
tuple
([
np
.
zeros
(
state_shape
,
dtype
)
for
_
in
range
(
self
.
state_components
)
])
states
=
split_states
(
initial_states
,
self
.
num_directions
==
2
,
self
.
state_components
)
final_states
=
[]
for
i
,
rnn_layer
in
enumerate
(
self
):
if
i
>
0
:
inputs
=
dropout
(
inputs
,
self
.
dropout
)
outputs
,
final_state
=
rnn_layer
(
inputs
,
states
[
i
],
sequence_length
)
final_states
.
append
(
final_state
)
inputs
=
outputs
final_states
=
concat_states
(
final_states
,
self
.
num_directions
==
2
,
self
.
state_components
)
return
outputs
,
final_states
class
LSTM
(
RNNMixin
):
def
__init__
(
self
,
input_size
,
hidden_size
,
num_layers
=
1
,
direction
=
"forward"
,
dropout
=
0.
,
time_major
=
False
):
super
(
LSTM
,
self
).
__init__
()
if
direction
in
[
"forward"
,
"backward"
]:
is_reverse
=
direction
==
"backward"
cell
=
LSTMCell
(
input_size
,
hidden_size
)
self
.
append
(
RNN
(
cell
,
is_reverse
,
time_major
))
for
i
in
range
(
1
,
num_layers
):
cell
=
LSTMCell
(
hidden_size
,
hidden_size
)
self
.
append
(
RNN
(
cell
,
is_reverse
,
time_major
))
elif
direction
==
"bidirectional"
:
cell_fw
=
LSTMCell
(
input_size
,
hidden_size
)
cell_bw
=
LSTMCell
(
input_size
,
hidden_size
)
self
.
append
(
BiRNN
(
cell_fw
,
cell_bw
,
time_major
))
for
i
in
range
(
1
,
num_layers
):
cell_fw
=
LSTMCell
(
2
*
hidden_size
,
hidden_size
)
cell_bw
=
LSTMCell
(
2
*
hidden_size
,
hidden_size
)
self
.
append
(
BiRNN
(
cell_fw
,
cell_bw
,
time_major
))
else
:
raise
ValueError
(
"direction should be forward, backward or bidirectional, "
"received direction = {}"
.
format
(
direction
))
self
.
input_size
=
input_size
self
.
hidden_size
=
hidden_size
self
.
dropout
=
dropout
self
.
num_directions
=
2
if
direction
==
"bidirectional"
else
1
self
.
time_major
=
time_major
self
.
num_layers
=
num_layers
self
.
state_components
=
2
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
class
TestCUDNNLstmOp
(
OpTest
):
#
TODO(GaoWei8):when input dtype is fp64, precision threshold should be removed.
#
TODO(GaoWei8): Need to satisfy the result through the new interface
def
setUp
(
self
):
self
.
op_type
=
"cudnn_lstm"
self
.
dtype
=
np
.
float64
self
.
sequence_length
=
np
.
array
([
12
,
11
,
10
,
9
,
8
],
dtype
=
np
.
int32
)
self
.
num_layers
=
1
seq_length
=
20
seq_length
=
12
batch_size
=
5
hidden_size
=
20
input_size
=
21
hidden_size
=
21
input_weight_size
=
(
hidden_size
*
hidden_size
)
*
4
hidden_weight_size
=
(
hidden_size
*
hidden_size
)
*
4
weight_size
=
input_weight_size
+
hidden_weight_size
weight_size
+=
hidden_size
*
8
weight_size
*=
self
.
num_layers
input
=
np
.
random
.
uniform
(
low
=-
0.1
,
high
=
0.1
,
size
=
(
seq_length
,
batch_size
,
hidden_size
)).
astype
(
self
.
dtype
)
flat_w
=
np
.
random
.
uniform
(
low
=-
0.1
,
high
=
0.1
,
size
=
(
weight_size
)).
astype
(
self
.
dtype
)
output
,
last_hidden
,
last_cell
=
lstm_naive
(
input
,
flat_w
)
init_h
=
np
.
zeros
((
1
,
batch_size
,
hidden_size
),
dtype
=
np
.
float64
)
init_c
=
np
.
zeros
((
1
,
batch_size
,
hidden_size
),
dtype
=
np
.
float64
)
low
=-
0.1
,
high
=
0.1
,
size
=
(
seq_length
,
batch_size
,
input_size
)).
astype
(
self
.
dtype
)
input
[
11
][
1
:][:]
=
0
input
[
10
][
2
:][:]
=
0
input
[
9
][
3
:][:]
=
0
input
[
8
][
4
:][:]
=
0
rnn1
=
LSTM
(
input_size
,
hidden_size
,
self
.
num_layers
,
time_major
=
True
,
direction
=
"forward"
)
output
,
(
last_hidden
,
last_cell
)
=
rnn1
(
input
,
sequence_length
=
self
.
sequence_length
)
flat_w
=
np
.
ones
((
weight_size
)).
astype
(
self
.
dtype
)
init_h
=
np
.
zeros
((
self
.
num_layers
,
batch_size
,
hidden_size
)).
astype
(
self
.
dtype
)
init_c
=
np
.
zeros
((
self
.
num_layers
,
batch_size
,
hidden_size
)).
astype
(
self
.
dtype
)
state_out
=
np
.
ndarray
((
300
)).
astype
(
"uint8"
)
self
.
inputs
=
{
...
...
@@ -152,9 +405,10 @@ class TestCUDNNLstmOp(OpTest):
self
.
attrs
=
{
'dropout_prob'
:
0.0
,
'is_bidirec'
:
False
,
'input_size'
:
hidden
_size
,
'input_size'
:
input
_size
,
'hidden_size'
:
hidden_size
,
'num_layers'
:
1
,
'sequence_length'
:
self
.
sequence_length
.
tolist
()
}
self
.
outputs
=
{
'Out'
:
output
,
...
...
@@ -164,19 +418,33 @@ class TestCUDNNLstmOp(OpTest):
'StateOut'
:
state_out
}
def
set_attrs
(
self
):
pass
def
test_output_with_place
(
self
):
# depend on the scope structure
place
=
core
.
CUDAPlace
(
0
)
self
.
check_output_with_place
(
place
,
no_check_set
=
[
'Reserve'
,
'StateOut'
])
def
test_grad_with_place
(
self
):
# depend on the scope structure
place
=
core
.
CUDAPlace
(
0
)
self
.
check_grad_with_place
(
place
,
set
([
'Input'
,
'W'
,
'InitH'
,
'InitC'
]),
[
'Out'
,
'LastH'
,
'LastC'
],
max_relative_error
=
1e-4
)
self
.
check_grad_with_place
(
place
,
set
([
'Input'
,
'W'
,
'InitH'
,
'InitC'
]),
[
'Out'
,
'LastH'
,
'LastC'
])
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
class
TestCUDNNLstmOp2
(
TestCUDNNLstmOp
):
def
set_attrs
(
self
):
self
.
sequence_length
=
np
.
array
([],
dtype
=
np
.
int32
)
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
class
TestCUDNNLstmOp3
(
TestCUDNNLstmOp
):
def
set_attrs
(
self
):
self
.
num_layers
=
2
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
...
...
@@ -198,7 +466,7 @@ class TestCUDNNlstmAPI(unittest.TestCase):
'float64'
,
0.0
)
rnn_out
,
last_h
,
last_c
=
layers
.
lstm
(
input
,
init_h
,
init_c
,
seq_len
,
hidden_size
,
num_layers
,
dropout_prob
)
dropout_prob
,
False
,
True
)
exe
=
fluid
.
Executor
(
fluid
.
CUDAPlace
(
0
))
exe
.
run
(
fluid
.
default_startup_program
())
input_i
=
np
.
random
.
uniform
(
...
...
@@ -208,12 +476,6 @@ class TestCUDNNlstmAPI(unittest.TestCase):
feed
=
{
'input'
:
input_i
},
fetch_list
=
[
rnn_out
,
last_h
,
last_c
,
'cudnn_lstm_0.w_0'
])
output
,
last_hidden
,
last_cell
=
lstm_naive
(
input_i
,
out
[
3
])
self
.
assertTrue
(
np
.
allclose
(
output
,
out
[
0
],
atol
=
1e-5
))
self
.
assertTrue
(
np
.
allclose
(
last_hidden
,
out
[
1
],
atol
=
1e-5
))
self
.
assertTrue
(
np
.
allclose
(
last_cell
,
out
[
2
],
atol
=
1e-5
))
if
__name__
==
'__main__'
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录