Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
4f71a6ee
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
4f71a6ee
编写于
12月 03, 2018
作者:
H
Hongyu Liu
提交者:
GitHub
12月 03, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #14622 from PaddlePaddle/add_cudnn_lstm
Add cudnn lstm
上级
c7382df8
9f7eae86
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
1102 addition
and
4 deletion
+1102
-4
paddle/fluid/API.spec
paddle/fluid/API.spec
+1
-0
paddle/fluid/operators/cudnn_lstm_op.cc
paddle/fluid/operators/cudnn_lstm_op.cc
+218
-0
paddle/fluid/operators/cudnn_lstm_op.cu.cc
paddle/fluid/operators/cudnn_lstm_op.cu.cc
+485
-0
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+17
-1
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+163
-0
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+20
-2
python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+192
-0
python/paddle/fluid/tests/unittests/testsuite.py
python/paddle/fluid/tests/unittests/testsuite.py
+6
-1
未找到文件。
paddle/fluid/API.spec
浏览文件 @
4f71a6ee
...
...
@@ -194,6 +194,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...
...
paddle/fluid/operators/cudnn_lstm_op.cc
0 → 100644
浏览文件 @
4f71a6ee
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
class
CudnnLSTMOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Input"
),
"Input(Input) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"W"
),
"Input(Weight) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"InitH"
),
"Input(init_h) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"InitC"
),
"Input(init_c) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Cache"
),
"Input(Cache) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"last_h"
),
"Output(last_h) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"last_c"
),
"Output(last_c) of LSTM should not be null."
);
auto
in_dims
=
ctx
->
GetInputDim
(
"Input"
);
PADDLE_ENFORCE_EQ
(
in_dims
.
size
(),
3
,
"Input(X)'s rank must be 3."
);
ctx
->
SetOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"Input"
));
ctx
->
SetOutputDim
(
"last_h"
,
ctx
->
GetInputDim
(
"InitH"
));
ctx
->
SetOutputDim
(
"last_c"
,
ctx
->
GetInputDim
(
"InitC"
));
}
};
class
CudnnLSTMOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"Input"
,
"(Tensor) RNN input tensor, which support variable-time length input "
"sequence."
"The shape of the Tensor MUST be ( seq_len * batch_size * input_size)"
"seq_len is the total time step in this mini-batch (CAN be change in "
"different batch)"
"batch_size is the instance number of this batch"
"input_size is the hidden size of the input."
"input_hidden_size and the hidden_size in the next may not be same"
);
AddInput
(
"InitH"
,
"(Tensor) the initial hidden state of the LSTM"
"input. This is a tensor with shape (num_layers x batch_size x "
"hidden_size)"
"and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)"
);
AddInput
(
"InitC"
,
"(Tensor) the initial cell state of the LSTm "
"input. This is a tensor with shape (num_layers x batch_size x "
"hidden_size)"
"and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)"
);
AddInput
(
"W"
,
"(Tensor) the learnable hidden-hidden weights."
" The shape is (N), where N is total weight size of the LSTM. "
" cudnn concatenate all the weight to one Tensor"
);
AddInput
(
"Cache"
,
"The cache of dropout op, a RAW type variable including random "
"number generator states and some descriptors, which is used in "
"cudnn kernel."
)
.
AsDispensable
();
AddOutput
(
"Out"
,
"(Tensor) the hidden state of LSTM operator. "
"The shape is ( seq_len x batch_size x hidden_size) if "
"is_bidirec is False"
"and When is_bidirec is True, the shape will be ( seq_len x "
"batch_size x hidden_size * 2) "
);
AddOutput
(
"last_h"
,
"(Tensor) the hidden state of the last step. "
"The shape is ( num_layers x batch_size x hidden_size) if "
"is_bidirec is False"
"and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)"
);
AddOutput
(
"last_c"
,
"(Tensor) the cell state of the last step"
"The shape is ( num_layers x batch_size x hidden_size) if "
"is_bidirec is False"
"and When is_bidirect is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size*2)"
);
AddAttr
<
int
>
(
"max_len"
,
"max length of the LSTM op"
"the first dim of the Input can NOT be greater than max_len"
)
.
SetDefault
(
20
);
AddAttr
<
float
>
(
"dropout_prob"
,
"dropout prob of the dropout op"
"the dropout ONLY work between lstm layers, not between time steps"
"There is no dropout work on the Out tensor"
)
.
SetDefault
(
0.0
);
AddAttr
<
bool
>
(
"is_bidirec"
,
"is_bidirec"
"if it is bidirection rnn"
"The will affect the shape of the Out, last_h, and last_c"
)
.
SetDefault
(
false
);
AddAttr
<
int
>
(
"input_size"
,
"input size ot the Input Tensor"
).
SetDefault
(
10
);
AddAttr
<
int
>
(
"hidden_size"
,
"hidden size of the LSTM"
).
SetDefault
(
100
);
AddAttr
<
int
>
(
"num_layers"
,
"the total layer number of the LSTM"
)
.
SetDefault
(
1
);
AddAttr
<
bool
>
(
"is_test"
,
"True if in test phase."
).
SetDefault
(
false
);
AddAttr
<
int
>
(
"seed"
,
"seed to used if fix_seed is True"
).
SetDefault
(
-
1
);
AddComment
(
R"DOC(
CUDNN LSTM implementation
A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
$$ h_t = o_t \\odot tanh(c_t) $$
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors.
- `tanh` is the activation functions.
- $\tilde{c_t}$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
X represensts a matrix multiplication
)DOC"
);
}
};
class
CudnnLSTMGradOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Input"
),
"Input(Input) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"W"
),
"Input(W) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"last_h"
),
"Input(last_h) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"last_c"
),
"Input(last_c) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Cache"
),
"Input(last_c) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"InitH"
),
"Input(init_h) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"InitC"
),
"Input(init_c) of LSTM should not be null."
);
auto
SetOutGradDim
=
[
&
ctx
](
const
std
::
string
&
name
)
{
auto
g_name
=
framework
::
GradVarName
(
name
);
if
(
ctx
->
HasOutput
(
g_name
))
{
ctx
->
SetOutputDim
(
g_name
,
ctx
->
GetInputDim
(
name
));
}
};
SetOutGradDim
(
"Input"
);
SetOutGradDim
(
"W"
);
SetOutGradDim
(
"InitH"
);
SetOutGradDim
(
"InitC"
);
}
};
template
<
typename
T
>
class
NotImpleKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_THROW
(
"CPU is not support for this kernel now. Will be add in the future"
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
cudnn_lstm
,
ops
::
CudnnLSTMOp
,
ops
::
CudnnLSTMOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
cudnn_lstm_grad
,
ops
::
CudnnLSTMGradOp
);
REGISTER_OP_CPU_KERNEL
(
cudnn_lstm
,
ops
::
NotImpleKernel
<
float
>
);
REGISTER_OP_CPU_KERNEL
(
cudnn_lstm_grad
,
ops
::
NotImpleKernel
<
float
>
);
paddle/fluid/operators/cudnn_lstm_op.cu.cc
0 → 100644
浏览文件 @
4f71a6ee
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
using
Tensor
=
framework
::
Tensor
;
struct
CudnnRNNCache
{
CudnnRNNCache
()
{
x_desc_
=
NULL
;
y_desc_
=
NULL
;
dx_desc_
=
NULL
;
dy_desc_
=
NULL
;
}
~
CudnnRNNCache
()
{
release
();
}
cudnnRNNDescriptor_t
rnn_desc_
;
cudnnTensorDescriptor_t
*
x_desc_
;
cudnnTensorDescriptor_t
*
y_desc_
;
cudnnTensorDescriptor_t
*
dx_desc_
;
cudnnTensorDescriptor_t
*
dy_desc_
;
cudnnTensorDescriptor_t
hx_desc_
;
cudnnTensorDescriptor_t
cx_desc_
;
cudnnTensorDescriptor_t
hy_desc_
;
cudnnTensorDescriptor_t
cy_desc_
;
cudnnTensorDescriptor_t
dhx_desc_
;
cudnnTensorDescriptor_t
dcx_desc_
;
cudnnTensorDescriptor_t
dhy_desc_
;
cudnnTensorDescriptor_t
dcy_desc_
;
cudnnTensorDescriptor_t
output_x_desc_
;
cudnnTensorDescriptor_t
output_y_desc_
;
cudnnDropoutDescriptor_t
dropout_desc_
;
size_t
weights_size_
;
cudnnFilterDescriptor_t
w_desc_
;
cudnnFilterDescriptor_t
dw_desc_
;
size_t
workspace_size_
;
size_t
reserve_size_
;
Tensor
reserve_data_
;
Tensor
workspace_data_
;
Tensor
dropout_state_
;
size_t
max_length_
;
float
dropout_prob_
;
bool
is_bidirec_
;
int
batch_size_
;
int
input_size_
;
int
hidden_size_
;
int
num_layers_
;
int
seed_
;
void
init
(
cudnnHandle_t
handle
,
const
framework
::
ExecutionContext
&
ctx
,
size_t
max_len
,
int
batch_size
,
int
input_size
,
int
hidden_size
,
int
num_layers
,
float
dropout_prob
,
bool
is_bidirec
,
int
seed
,
int
weight_numel
)
{
max_length_
=
max_len
;
batch_size_
=
batch_size
;
input_size_
=
input_size
;
hidden_size_
=
hidden_size
;
num_layers_
=
num_layers
;
dropout_prob_
=
dropout_prob
;
is_bidirec_
=
is_bidirec
;
seed_
=
seed
;
x_desc_
=
new
cudnnTensorDescriptor_t
[
max_length_
];
y_desc_
=
new
cudnnTensorDescriptor_t
[
max_length_
];
dx_desc_
=
new
cudnnTensorDescriptor_t
[
max_length_
];
dy_desc_
=
new
cudnnTensorDescriptor_t
[
max_length_
];
int
dim_a
[
3
];
int
stride_a
[
3
];
for
(
size_t
i
=
0
;
i
<
max_length_
;
++
i
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
x_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
y_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dx_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dy_desc_
[
i
]));
dim_a
[
0
]
=
batch_size_
;
dim_a
[
1
]
=
input_size_
;
dim_a
[
2
]
=
1
;
stride_a
[
0
]
=
dim_a
[
2
]
*
dim_a
[
1
];
stride_a
[
1
]
=
dim_a
[
2
];
stride_a
[
2
]
=
1
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
x_desc_
[
i
],
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dx_desc_
[
i
],
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
dim_a
[
0
]
=
batch_size_
;
dim_a
[
1
]
=
is_bidirec_
?
hidden_size_
*
2
:
hidden_size_
;
dim_a
[
2
]
=
1
;
stride_a
[
0
]
=
dim_a
[
2
]
*
dim_a
[
1
];
stride_a
[
1
]
=
dim_a
[
2
];
stride_a
[
2
]
=
1
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
y_desc_
[
i
],
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dy_desc_
[
i
],
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
}
dim_a
[
0
]
=
num_layers_
*
(
is_bidirec_
?
2
:
1
);
dim_a
[
1
]
=
batch_size_
;
dim_a
[
2
]
=
hidden_size_
;
stride_a
[
0
]
=
dim_a
[
2
]
*
dim_a
[
1
];
stride_a
[
1
]
=
dim_a
[
2
];
stride_a
[
2
]
=
1
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
hx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
cx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
hy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
cy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dhx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dcx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dhy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dcy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
hx_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
cx_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
hy_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
cy_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dhx_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dcx_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dhy_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dcy_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateDropoutDescriptor
(
&
dropout_desc_
));
size_t
state_size
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDropoutGetStatesSize
(
handle
,
&
state_size
);
dropout_state_
.
Resize
({
static_cast
<
int64_t
>
(
state_size
)}));
auto
*
dropout_state_data
=
dropout_state_
.
mutable_data
<
uint8_t
>
(
ctx
.
GetPlace
());
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetDropoutDescriptor
(
dropout_desc_
,
handle
,
dropout_prob_
,
dropout_state_data
,
state_size
,
seed_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateRNNDescriptor
(
&
rnn_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetRNNDescriptor_v6
(
handle
,
rnn_desc_
,
hidden_size_
,
num_layers_
,
dropout_desc_
,
CUDNN_LINEAR_INPUT
,
is_bidirec_
?
CUDNN_BIDIRECTIONAL
:
CUDNN_UNIDIRECTIONAL
,
CUDNN_LSTM
,
CUDNN_RNN_ALGO_STANDARD
,
CUDNN_DATA_FLOAT
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateFilterDescriptor
(
&
w_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateFilterDescriptor
(
&
dw_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetRNNParamsSize
(
handle
,
rnn_desc_
,
x_desc_
[
0
],
&
weights_size_
,
CUDNN_DATA_FLOAT
));
PADDLE_ENFORCE_EQ
(
weights_size_
,
sizeof
(
float
)
*
weight_numel
,
"cudnn lstm weight size should be SAME"
);
int
dim_w
[
3
];
dim_w
[
0
]
=
weights_size_
/
sizeof
(
float
);
dim_w
[
1
]
=
1
;
dim_w
[
2
]
=
1
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetFilterNdDescriptor
(
w_desc_
,
CUDNN_DATA_FLOAT
,
CUDNN_TENSOR_NCHW
,
3
,
dim_w
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetFilterNdDescriptor
(
dw_desc_
,
CUDNN_DATA_FLOAT
,
CUDNN_TENSOR_NCHW
,
3
,
dim_w
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetRNNWorkspaceSize
(
handle
,
rnn_desc_
,
max_length_
,
x_desc_
,
&
workspace_size_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetRNNTrainingReserveSize
(
handle
,
rnn_desc_
,
max_length_
,
x_desc_
,
&
reserve_size_
));
reserve_data_
.
Resize
({
static_cast
<
int64_t
>
(
reserve_size_
)});
reserve_data_
.
mutable_data
<
uint8_t
>
(
ctx
.
GetPlace
());
workspace_data_
.
Resize
({
static_cast
<
int64_t
>
(
workspace_size_
)});
workspace_data_
.
mutable_data
<
uint8_t
>
(
ctx
.
GetPlace
());
}
void
release
()
{
for
(
size_t
i
=
0
;
i
<
max_length_
;
++
i
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
x_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
y_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dx_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dy_desc_
[
i
]));
}
delete
[]
x_desc_
;
delete
[]
y_desc_
;
delete
[]
dx_desc_
;
delete
[]
dy_desc_
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
hx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
cx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
hy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
cy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dhx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dcx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dhy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dcy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyDropoutDescriptor
(
dropout_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyRNNDescriptor
(
rnn_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyFilterDescriptor
(
w_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyFilterDescriptor
(
dw_desc_
));
}
};
template
<
typename
T
>
class
CudnnLSTMGPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
Tensor
*
x
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
const
Tensor
*
init_h
=
ctx
.
Input
<
Tensor
>
(
"InitH"
);
const
Tensor
*
init_c
=
ctx
.
Input
<
Tensor
>
(
"InitC"
);
auto
w
=
ctx
.
Input
<
Tensor
>
(
"W"
);
Tensor
*
out
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
Tensor
*
last_h
=
ctx
.
Output
<
Tensor
>
(
"last_h"
);
Tensor
*
last_c
=
ctx
.
Output
<
Tensor
>
(
"last_c"
);
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
init_h_data
=
init_h
->
data
<
T
>
();
const
T
*
init_c_data
=
init_c
->
data
<
T
>
();
const
T
*
w_data
=
w
->
data
<
T
>
();
T
*
out_data
=
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
last_h_data
=
last_h
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
last_c_data
=
last_c
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
size_t
max_len
=
ctx
.
Attr
<
int
>
(
"max_len"
);
float
dropout_prob
=
ctx
.
Attr
<
float
>
(
"dropout_prob"
);
bool
is_bidirec
=
ctx
.
Attr
<
bool
>
(
"is_bidirec"
);
int
input_size
=
ctx
.
Attr
<
int
>
(
"input_size"
);
int
hidden_size
=
ctx
.
Attr
<
int
>
(
"hidden_size"
);
int
num_layers
=
ctx
.
Attr
<
int
>
(
"num_layers"
);
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
*
cache_var
=
ctx
.
InputVar
(
"Cache"
);
if
(
!
cache_var
)
{
// The RAW type cache variable wouldn't be created and broadcasted on
// multi-devices before the first running.
// use parent scope to make cache persistable
auto
*
scope
=
const_cast
<
framework
::
Scope
*>
(
ctx
.
scope
().
parent
());
auto
cache_var_name
=
ctx
.
Inputs
(
"Cache"
)[
0
];
cache_var
=
scope
->
Var
(
cache_var_name
);
}
CudnnRNNCache
*
cudnn_rnn_cache
=
nullptr
;
if
(
cache_var
->
IsInitialized
())
{
cudnn_rnn_cache
=
const_cast
<
framework
::
Variable
*>
(
cache_var
)
->
GetMutable
<
CudnnRNNCache
>
();
}
else
{
cudnn_rnn_cache
=
const_cast
<
framework
::
Variable
*>
(
cache_var
)
->
GetMutable
<
CudnnRNNCache
>
();
std
::
random_device
rnd
;
int
seed
=
ctx
.
Attr
<
int
>
(
"seed"
);
if
(
seed
==
-
1
)
{
seed
=
rnd
();
}
auto
input_w_numel
=
w
->
numel
();
auto
batch_size
=
x
->
dims
()[
1
];
cudnn_rnn_cache
->
init
(
handle
,
ctx
,
max_len
,
batch_size
,
input_size
,
hidden_size
,
num_layers
,
dropout_prob
,
is_bidirec
,
seed
,
input_w_numel
);
}
auto
run_seq_len
=
x
->
dims
()[
0
];
if
(
is_test
)
{
// for inference
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnRNNForwardInference
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
run_seq_len
,
cudnn_rnn_cache
->
x_desc_
,
x_data
,
cudnn_rnn_cache
->
hx_desc_
,
init_h_data
,
cudnn_rnn_cache
->
cx_desc_
,
init_c_data
,
cudnn_rnn_cache
->
w_desc_
,
w_data
,
cudnn_rnn_cache
->
y_desc_
,
out_data
,
cudnn_rnn_cache
->
hy_desc_
,
last_h_data
,
cudnn_rnn_cache
->
cy_desc_
,
last_c_data
,
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
workspace_size_
));
}
else
{
// for train
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnRNNForwardTraining
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
run_seq_len
,
cudnn_rnn_cache
->
x_desc_
,
x_data
,
cudnn_rnn_cache
->
hx_desc_
,
init_h_data
,
cudnn_rnn_cache
->
cx_desc_
,
init_c_data
,
cudnn_rnn_cache
->
w_desc_
,
w_data
,
cudnn_rnn_cache
->
y_desc_
,
out_data
,
cudnn_rnn_cache
->
hy_desc_
,
last_h_data
,
cudnn_rnn_cache
->
cy_desc_
,
last_c_data
,
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
workspace_size_
,
cudnn_rnn_cache
->
reserve_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
reserve_size_
));
}
}
};
template
<
typename
T
>
class
CudnnLSTMGPUGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
auto
*
weight
=
ctx
.
Input
<
Tensor
>
(
"W"
);
auto
*
init_h
=
ctx
.
Input
<
Tensor
>
(
"InitH"
);
auto
*
init_c
=
ctx
.
Input
<
Tensor
>
(
"InitC"
);
// auto * last_h = ctx.Input<Tensor>("last_h");
// auto * last_c = ctx.Input<Tensor>("last_c");
auto
*
out
=
ctx
.
Input
<
Tensor
>
(
"Out"
);
auto
*
out_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
last_h_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"last_h"
));
auto
*
last_c_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"last_c"
));
// auto* init_h = ctx.Input<Tensor>("init_h");
// auto* init_c = ctx.Input<Tensor>("init_c");
auto
*
in_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Input"
));
auto
*
weight_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"W"
));
auto
*
init_h_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"InitH"
));
auto
*
init_c_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"InitC"
));
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
*
cache_var
=
ctx
.
InputVar
(
"Cache"
);
PADDLE_ENFORCE
(
cache_var
->
IsInitialized
());
CudnnRNNCache
*
cudnn_rnn_cache
=
const_cast
<
framework
::
Variable
*>
(
cache_var
)
->
GetMutable
<
CudnnRNNCache
>
();
auto
input_dims
=
input
->
dims
();
auto
weight_dims
=
weight
->
dims
();
auto
init_h_dims
=
init_h
->
dims
();
auto
init_c_dims
=
init_c
->
dims
();
in_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
weight_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
math
::
SetConstant
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
zero
;
zero
(
dev_ctx
,
in_grad
,
static_cast
<
T
>
(
0.0
));
zero
(
dev_ctx
,
weight_grad
,
static_cast
<
T
>
(
0.0
));
T
*
init_h_grad_data
=
NULL
;
if
(
init_h_grad
==
nullptr
)
{
Tensor
init_h_grad_temp
;
init_h_grad_temp
.
mutable_data
<
T
>
(
init_h_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
&
init_h_grad_temp
,
static_cast
<
T
>
(
0.0
));
init_h_grad_data
=
init_h_grad_temp
.
data
<
T
>
();
}
else
{
init_h_grad
->
mutable_data
<
T
>
(
init_h_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
init_h_grad
,
static_cast
<
T
>
(
0.0
));
init_h_grad_data
=
init_h_grad
->
data
<
T
>
();
}
T
*
init_c_grad_data
=
NULL
;
if
(
init_c_grad
==
nullptr
)
{
Tensor
init_c_grad_temp
;
init_c_grad_temp
.
mutable_data
<
T
>
(
init_c_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
&
init_c_grad_temp
,
static_cast
<
T
>
(
0.0
));
init_c_grad_data
=
init_c_grad_temp
.
data
<
T
>
();
}
else
{
init_c_grad
->
mutable_data
<
T
>
(
init_c_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
init_c_grad
,
static_cast
<
T
>
(
0.0
));
init_c_grad_data
=
init_c_grad
->
data
<
T
>
();
}
const
T
*
last_h_grad_data
=
NULL
;
if
(
last_h_grad
==
nullptr
)
{
Tensor
last_h_grad_temp
;
last_h_grad_temp
.
mutable_data
<
T
>
(
init_h_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
&
last_h_grad_temp
,
static_cast
<
T
>
(
0.0
));
last_h_grad_data
=
(
const
T
*
)
last_h_grad_temp
.
data
<
T
>
();
}
else
{
last_h_grad_data
=
last_h_grad
->
data
<
T
>
();
}
const
T
*
last_c_grad_data
=
NULL
;
if
(
last_c_grad
==
nullptr
)
{
Tensor
last_c_grad_temp
;
last_c_grad_temp
.
mutable_data
<
T
>
(
init_c_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
&
last_c_grad_temp
,
static_cast
<
T
>
(
0.0
));
last_c_grad_data
=
(
const
T
*
)
last_c_grad_temp
.
data
<
T
>
();
}
else
{
last_c_grad_data
=
last_c_grad
->
data
<
T
>
();
}
const
T
*
out_grad_data
=
NULL
;
if
(
out_grad
==
nullptr
)
{
Tensor
out_grad_temp
;
out_grad_temp
.
mutable_data
<
T
>
(
out
->
dims
(),
ctx
.
GetPlace
());
zero
(
dev_ctx
,
&
out_grad_temp
,
static_cast
<
T
>
(
0.0
));
out_grad_data
=
(
const
T
*
)
out_grad_temp
.
data
<
T
>
();
}
else
{
out_grad_data
=
out_grad
->
data
<
T
>
();
}
// zero( dev_ctx, last_h_grad, static_cast<T>(0.0));
// zero( dev_ctx, last_c_grad, static_cast<T>(0.0));
auto
out_data
=
out
->
data
<
T
>
();
// auto out_grad_data = out_grad->data<T>();
auto
weight_data
=
weight
->
data
<
T
>
();
auto
init_h_data
=
init_h
->
data
<
T
>
();
auto
init_c_data
=
init_c
->
data
<
T
>
();
auto
in_grad_data
=
in_grad
->
data
<
T
>
();
auto
work_data
=
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
();
auto
reserve_data
=
cudnn_rnn_cache
->
reserve_data_
.
data
<
uint8_t
>
();
auto
run_seq_len
=
input_dims
[
0
];
PADDLE_ENFORCE_LE
((
size_t
)
run_seq_len
,
cudnn_rnn_cache
->
max_length_
,
"cudnn running seq_len CAN not greater max_lengh"
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnRNNBackwardData
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
run_seq_len
,
cudnn_rnn_cache
->
y_desc_
,
out_data
,
cudnn_rnn_cache
->
dy_desc_
,
out_grad_data
,
cudnn_rnn_cache
->
dhy_desc_
,
last_h_grad_data
,
cudnn_rnn_cache
->
dcy_desc_
,
last_c_grad_data
,
cudnn_rnn_cache
->
w_desc_
,
weight_data
,
cudnn_rnn_cache
->
hx_desc_
,
init_h_data
,
cudnn_rnn_cache
->
cx_desc_
,
init_c_data
,
cudnn_rnn_cache
->
dx_desc_
,
in_grad_data
,
cudnn_rnn_cache
->
dhx_desc_
,
init_h_grad_data
,
cudnn_rnn_cache
->
dcx_desc_
,
init_c_grad_data
,
work_data
,
cudnn_rnn_cache
->
workspace_size_
,
reserve_data
,
cudnn_rnn_cache
->
reserve_size_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnRNNBackwardWeights
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
run_seq_len
,
cudnn_rnn_cache
->
x_desc_
,
input
->
data
<
T
>
(),
cudnn_rnn_cache
->
hx_desc_
,
init_h
->
data
<
T
>
(),
cudnn_rnn_cache
->
y_desc_
,
out
->
data
<
T
>
(),
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
workspace_size_
,
cudnn_rnn_cache
->
dw_desc_
,
weight_grad
->
data
<
T
>
(),
cudnn_rnn_cache
->
reserve_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
reserve_size_
));
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
cudnn_lstm
,
ops
::
CudnnLSTMGPUKernel
<
float
>
);
REGISTER_OP_CUDA_KERNEL
(
cudnn_lstm_grad
,
ops
::
CudnnLSTMGPUGradKernel
<
float
>
);
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
4f71a6ee
...
...
@@ -111,7 +111,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnGetErrorString);
__macro(cudnnGetErrorString); \
__macro(cudnnCreateDropoutDescriptor); \
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnSetRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \
__macro(cudnnGetRNNTrainingReserveSize); \
__macro(cudnnRNNForwardTraining); \
__macro(cudnnRNNBackwardData); \
__macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \
__macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor); \
__macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
4f71a6ee
...
...
@@ -169,6 +169,7 @@ __all__ = [
'log_loss'
,
'add_position_encoding'
,
'bilinear_tensor_product'
,
'lstm'
,
]
...
...
@@ -472,6 +473,168 @@ def dynamic_lstm(input,
return
hidden
,
cell
def
lstm
(
input
,
init_h
,
init_c
,
max_len
,
hidden_size
,
num_layers
,
dropout_prob
=
0.0
,
is_bidirec
=
False
,
is_test
=
False
,
name
=
None
,
default_initializer
=
None
,
seed
=-
1
):
"""
If Device is GPU, This op will use cudnn LSTM implementation
A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
$$ i_t =
\\
sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
$$ f_t =
\\
sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
$$ o_t =
\\
sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
$$
\\
tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
$$ c_t = f_t
\\
odot c_{t-1} + i_t
\\
odot
\\
tilde{c_t} $$
$$ h_t = o_t
\\
odot tanh(c_t) $$
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors.
- `tanh` is the activation functions.
- $
\t
ilde{c_t}$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
X represensts a matrix multiplication
Args:
input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
init_h(Variable): The initial hidden state of the LSTM
This is a tensor with shape ( num_layers x batch_size x hidden_size)
if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
init_c(Variable): The initial cell state of the LSTM.
This is a tensor with shape ( num_layers x batch_size x hidden_size )
if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
hidden_size (int): hidden size of the LSTM
num_layers (int): total layers number of the LSTM
dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
There is NO dropout work on rnn output of the last RNN layers
is_bidirec (bool): If it is bidirectional
is_test (bool): If it is in test phrase
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
default_initializer(Initialize|None): Where use initializer to initialize the Weight
If set None, defaule initializer will be used
seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed
Returns:
rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size)
if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
last_h(Tensor): the hidden state of the last step of LSTM
shape is ( num_layers x batch_size x hidden_size )
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
last_c(Tensor): the cell state of the last step of LSTM
shape is ( num_layers x batch_size x hidden_size )
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
Examples:
.. code-block:: python
input = embedding
batch_size = 20
max_len = 100
dropout_prob = 0.2
input_size = 100
hidden_size = 150
num_layers = 1
init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c,
\
max_len, dropout_prob, input_size, hidden_size,
\
num_layers)
"""
helper
=
LayerHelper
(
'cudnn_lstm'
,
**
locals
())
dtype
=
input
.
dtype
input_shape
=
list
(
input
.
shape
)
input_size
=
input_shape
[
-
1
]
weight_size
=
0
for
i
in
range
(
num_layers
):
if
i
==
0
:
input_weight_size
=
(
input_size
*
hidden_size
)
*
4
else
:
if
is_bidirec
:
input_weight_size
=
(
hidden_size
*
2
*
hidden_size
)
*
4
else
:
input_weight_size
=
(
hidden_size
*
hidden_size
)
*
4
hidden_weight_size
=
(
hidden_size
*
hidden_size
)
*
4
if
is_bidirec
:
weight_size
+=
(
input_weight_size
+
hidden_weight_size
)
*
2
weight_size
+=
hidden_size
*
8
*
2
else
:
weight_size
+=
input_weight_size
+
hidden_weight_size
weight_size
+=
hidden_size
*
8
weight
=
helper
.
create_parameter
(
attr
=
helper
.
param_attr
,
shape
=
[
weight_size
],
dtype
=
dtype
,
default_initializer
=
default_initializer
)
out
=
helper
.
create_variable_for_type_inference
(
dtype
)
last_h
=
helper
.
create_variable_for_type_inference
(
dtype
)
last_c
=
helper
.
create_variable_for_type_inference
(
dtype
)
cache
=
helper
.
create_variable
(
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
,
stop_gradient
=
True
)
helper
.
append_op
(
type
=
'cudnn_lstm'
,
inputs
=
{
'Input'
:
input
,
'InitH'
:
init_h
,
'InitC'
:
init_c
,
'W'
:
weight
,
'Cache'
:
cache
,
},
outputs
=
{
'Out'
:
out
,
'last_h'
:
last_h
,
'last_c'
:
last_c
,
},
attrs
=
{
'max_len'
:
max_len
,
'is_bidirec'
:
is_bidirec
,
'input_size'
:
input_size
,
'hidden_size'
:
hidden_size
,
'num_layers'
:
num_layers
,
'is_test'
:
is_test
,
'dropout_prob'
:
dropout_prob
,
'seed'
:
seed
,
})
return
out
,
last_h
,
last_c
def
dynamic_lstmp
(
input
,
size
,
proj_size
,
...
...
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
4f71a6ee
...
...
@@ -216,6 +216,15 @@ class OpTest(unittest.TestCase):
self
.
dtype
)
outputs
=
append_input_output
(
block
,
op_proto
,
self
.
outputs
,
False
,
self
.
dtype
)
if
hasattr
(
self
,
"cache_name_list"
):
for
name
in
self
.
cache_name_list
:
inputs
[
name
]
=
block
.
create_var
(
name
=
name
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
,
stop_gradient
=
True
)
op
=
block
.
append_op
(
type
=
self
.
op_type
,
inputs
=
inputs
,
...
...
@@ -428,8 +437,17 @@ class OpTest(unittest.TestCase):
op_inputs
=
self
.
inputs
if
hasattr
(
self
,
"inputs"
)
else
dict
()
op_outputs
=
self
.
outputs
if
hasattr
(
self
,
"outputs"
)
else
dict
()
op_attrs
=
self
.
attrs
if
hasattr
(
self
,
"attrs"
)
else
dict
()
self
.
op
=
create_op
(
self
.
scope
,
self
.
op_type
,
op_inputs
,
op_outputs
,
op_attrs
)
cache_list
=
None
if
hasattr
(
self
,
"cache_name_list"
):
cache_list
=
self
.
cache_name_list
self
.
op
=
create_op
(
self
.
scope
,
self
.
op_type
,
op_inputs
,
op_outputs
,
op_attrs
,
cache_list
=
cache_list
)
if
no_grad_set
is
None
:
no_grad_set
=
set
()
...
...
python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
0 → 100644
浏览文件 @
4f71a6ee
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
import
paddle.fluid.core
as
core
from
op_test
import
OpTest
import
paddle.fluid
as
fluid
SIGMOID_THRESHOLD_MIN
=
-
40.0
SIGMOID_THRESHOLD_MAX
=
13.0
EXP_MAX_INPUT
=
40.0
def
lstm_naive
(
input
,
w
,
):
seq_len
,
batch_size
,
hidden_size
=
input
.
shape
offset
=
0
wi
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
wf
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
wc
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
wo
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
ri
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
rf
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
rc
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
ro
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
bi_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bf_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bc_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bo_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bi_2
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bf_2
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bc_2
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bo_2
=
w
[
offset
:
offset
+
hidden_size
]
def
sigmoid
(
x
):
y
=
np
.
copy
(
x
)
y
[
x
<
SIGMOID_THRESHOLD_MIN
]
=
SIGMOID_THRESHOLD_MIN
y
[
x
>
SIGMOID_THRESHOLD_MAX
]
=
SIGMOID_THRESHOLD_MAX
return
1.
/
(
1.
+
np
.
exp
(
-
y
))
def
tanh
(
x
):
y
=
-
2.
*
x
y
[
y
>
EXP_MAX_INPUT
]
=
EXP_MAX_INPUT
return
(
2.
/
(
1.
+
np
.
exp
(
y
)))
-
1.
output
=
[]
pre_h
=
np
.
zeros
((
batch_size
,
hidden_size
),
dtype
=
input
.
dtype
)
pre_c
=
np
.
zeros
((
batch_size
,
hidden_size
),
dtype
=
input
.
dtype
)
for
i
in
range
(
seq_len
):
emb_1
=
input
[
i
]
input_gate
=
sigmoid
(
np
.
matmul
(
emb_1
,
wi
)
+
np
.
matmul
(
pre_h
,
ri
)
+
bi_1
+
bi_2
)
forget_gate
=
sigmoid
(
np
.
matmul
(
emb_1
,
wf
)
+
np
.
matmul
(
pre_h
,
rf
)
+
bf_1
+
bf_2
)
output_gate
=
sigmoid
(
np
.
matmul
(
emb_1
,
wo
)
+
np
.
matmul
(
pre_h
,
ro
)
+
bo_1
+
bo_2
)
c_t_temp
=
tanh
(
np
.
matmul
(
emb_1
,
wc
)
+
np
.
matmul
(
pre_h
,
rc
)
+
bc_1
+
bc_2
)
new_c
=
input_gate
*
c_t_temp
+
forget_gate
*
pre_c
new_h
=
output_gate
*
tanh
(
new_c
)
pre_h
=
new_h
pre_c
=
new_c
output
.
append
(
new_h
)
output
=
np
.
concatenate
(
output
,
-
1
)
output
=
output
.
reshape
((
batch_size
,
-
1
,
hidden_size
))
output
=
output
.
transpose
((
1
,
0
,
2
))
return
output
,
pre_h
,
pre_c
class
TestCUDNNLstmOp
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"cudnn_lstm"
self
.
dtype
=
np
.
float32
num_steps
=
20
batch_size
=
5
hidden_size
=
20
input_weight_size
=
(
hidden_size
*
hidden_size
)
*
4
hidden_weight_size
=
(
hidden_size
*
hidden_size
)
*
4
weight_size
=
input_weight_size
+
hidden_weight_size
weight_size
+=
hidden_size
*
8
input
=
np
.
random
.
uniform
(
low
=-
0.1
,
high
=
0.1
,
size
=
(
num_steps
,
batch_size
,
hidden_size
)).
astype
(
self
.
dtype
)
flat_w
=
np
.
random
.
uniform
(
low
=-
0.1
,
high
=
0.1
,
size
=
(
weight_size
)).
astype
(
self
.
dtype
)
output
,
last_hidden
,
last_cell
=
lstm_naive
(
input
,
flat_w
)
init_h
=
np
.
zeros
((
batch_size
,
hidden_size
),
dtype
=
np
.
float32
)
init_c
=
np
.
zeros
((
batch_size
,
hidden_size
),
dtype
=
np
.
float32
)
scope
=
core
.
Scope
()
program
=
fluid
.
Program
()
block
=
program
.
global_block
()
cache_temp
=
block
.
create_var
(
name
=
"Cache"
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
,
stop_gradient
=
True
)
self
.
inputs
=
{
'Input'
:
OpTest
.
np_dtype_to_fluid_dtype
(
input
),
'W'
:
OpTest
.
np_dtype_to_fluid_dtype
(
flat_w
),
'InitH'
:
OpTest
.
np_dtype_to_fluid_dtype
(
init_h
),
'InitC'
:
OpTest
.
np_dtype_to_fluid_dtype
(
init_c
),
}
self
.
cache_name_list
=
[
'Cache'
]
self
.
attrs
=
{
'max_len'
:
num_steps
,
'dropout_prob'
:
0.0
,
'is_bidirec'
:
False
,
'input_size'
:
hidden_size
,
'hidden_size'
:
hidden_size
,
'num_layers'
:
1
,
}
self
.
outputs
=
{
'Out'
:
output
,
"last_h"
:
last_hidden
,
'last_c'
:
last_cell
}
def
test_output_with_place
(
self
):
if
self
.
testcuda
():
place
=
core
.
CUDAPlace
(
0
)
self
.
check_output_with_place
(
place
,
atol
=
1e-5
)
def
test_grad_with_place
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
self
.
check_grad_with_place
(
place
,
set
([
'Input'
,
'W'
,
'InitH'
,
'InitC'
]),
[
'Out'
,
'last_h'
,
'last_c'
],
max_relative_error
=
0.02
)
def
testcuda
(
self
):
return
core
.
is_compiled_with_cuda
()
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/testsuite.py
浏览文件 @
4f71a6ee
...
...
@@ -20,7 +20,7 @@ import paddle.fluid.core as core
from
paddle.fluid.op
import
Operator
def
create_op
(
scope
,
op_type
,
inputs
,
outputs
,
attrs
):
def
create_op
(
scope
,
op_type
,
inputs
,
outputs
,
attrs
,
cache_list
=
None
):
kwargs
=
dict
()
op_maker
=
core
.
op_proto_and_checker_maker
...
...
@@ -43,6 +43,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):
__create_var__
(
in_name
,
sub_in_name
)
else
:
__create_var__
(
in_name
,
in_name
)
if
cache_list
!=
None
and
isinstance
(
cache_list
,
list
):
for
name
in
cache_list
:
kwargs
[
name
]
=
[]
scope
.
var
(
name
)
kwargs
[
name
].
append
(
name
)
for
out_name
,
out_dup
in
Operator
.
get_op_outputs
(
op_type
):
if
out_name
in
outputs
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录