Commit d1a17cad authored by P phlrain

fix cudnn rnn; test=develop

Parent 487ee36a
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -122,13 +122,11 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
          "The will affect the shape of the Out, last_h, and last_c")
      .SetDefault(false);
  AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
- AddAttr<int>("batch_size", "the instance number the batch").SetDefault(10);
  AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
  AddAttr<int>("num_layers", "the total layer number of the LSTM")
      .SetDefault(1);
  AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
- AddAttr<bool>("fix_seed", "True if it fix dropout seed").SetDefault(false);
- AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
+ AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(-1);
  AddComment(R"DOC(
CUDNN LSTM implementation
@@ -136,16 +134,32 @@ A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
-it = σ(Wi X xt + Ri X ht-1 + bWi + bRi)
-ft = σ(Wf X xt + Rf X ht-1 + bWf + bRf)
-ot = σ(Wo X xt + Ro X ht-1 + bWo + bRo)
-c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc)
-ct = ft * ct-1 + it * c't
-ht = ot * tanh(ct)
-Where σ is the sigmoid operator: σ(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
+$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
+$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
+$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
+$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
+$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
+$$ h_t = o_t \\odot tanh(c_t) $$
+- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+  of weights from the input gate to the input)
+- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
+- sigmoid is the logistic sigmoid function.
+- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+  and cell activation vectors, respectively, all of which have the same size as
+  the cell output activation vector $h$.
+- The $\odot$ is the element-wise product of the vectors.
+- `tanh` is the activation functions.
+- $\tilde{c_t}$ is also called candidate hidden state,
+  which is computed based on the current input and the previous hidden state.
+Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
X represensts a matrix multiplication
-and tanh is the hyperbolic tangent function.
+it, ft, ot, c't represent the input, forget, output and new gates respectively.
)DOC");
...
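To make the gate equations in the DOC block above concrete, here is a minimal NumPy sketch of a single LSTM cell step. It only illustrates the stated formulas with ad-hoc names; it is not the cuDNN kernel and ignores multiple layers, dropout and bidirection:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(x_t, h_prev, c_prev, W, R, bW, bR):
        # W: (4*hidden, input), R: (4*hidden, hidden), biases: (4*hidden,)
        # assumed gate order: input, forget, output, candidate
        gates = W.dot(x_t) + R.dot(h_prev) + bW + bR
        i, f, o, g = np.split(gates, 4)
        i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)
        c_t = f * c_prev + i * np.tanh(g)   # c_t = f_t (.) c_{t-1} + i_t (.) c~_t
        h_t = o * np.tanh(c_t)              # h_t = o_t (.) tanh(c_t)
        return h_t, c_t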
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -273,7 +273,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
  size_t max_len = ctx.Attr<int>("max_len");
  float dropout_prob = ctx.Attr<float>("dropout_prob");
  bool is_bidirec = ctx.Attr<bool>("is_bidirec");
- int batch_size = ctx.Attr<int>("batch_size");
  int input_size = ctx.Attr<int>("input_size");
  int hidden_size = ctx.Attr<int>("hidden_size");
  int num_layers = ctx.Attr<int>("num_layers");
@@ -304,9 +303,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
  cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
                        ->GetMutable<CudnnRNNCache>();
  std::random_device rnd;
- int seed = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : rnd();
+ int seed = ctx.Attr<int>("seed");
+ if (seed == -1) {
+   seed = rnd();
+ }
  auto input_w_numel = w->numel();
+ auto batch_size = x->dims()[1];
  cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size,
                        hidden_size, num_layers, dropout_prob, is_bidirec,
                        seed, input_w_numel);
...
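The kernel change above replaces the old fix_seed/seed attribute pair with a single seed attribute: -1 means "draw a fresh random seed", any other value is used as-is for the cuDNN dropout state. Note also that batch_size is no longer an attribute; the kernel now reads it from the second dimension of the input tensor (x->dims()[1]). A rough Python equivalent of the seed branch, for illustration only:

    import random

    def resolve_dropout_seed(seed):
        # Mirrors the `if (seed == -1) seed = rnd();` branch in the CUDA kernel:
        # -1 requests a non-deterministic seed, anything else stays reproducible.
        return random.randrange(2**31 - 1) if seed == -1 else seed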
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -29,7 +29,10 @@ using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class CudnnLSTMKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override {}
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(
+        "CPU is not support for this kernel now. Will be add in the future");
+  }
};

template <typename DeviceContext, typename T>
...
@@ -169,7 +169,7 @@ __all__ = [
    'log_loss',
    'add_position_encoding',
    'bilinear_tensor_product',
-   'cudnn_lstm',
+   'lstm',
]
@@ -467,39 +467,53 @@ def dynamic_lstm(input,
    return hidden, cell

-def cudnn_lstm(input,
-               init_h,
-               init_c,
-               batch_size,
-               max_len,
-               dropout_prob,
-               input_size,
-               hidden_size,
-               num_layers,
-               is_bidirec=False,
-               dtype='float32',
-               is_test=False,
-               name=None,
-               default_initializer=None,
-               fix_seed=False,
-               seed=0):
+def lstm(input,
+         init_h,
+         init_c,
+         max_len,
+         dropout_prob,
+         input_size,
+         hidden_size,
+         num_layers,
+         is_bidirec=False,
+         dtype='float32',
+         is_test=False,
+         name=None,
+         default_initializer=None,
+         seed=-1):
    """
-   CUDNN LSTM implementation
+   If Device is GPU, This op will use cudnn LSTM implementation
    A four-gate Long Short-Term Memory network with no peephole connections.
    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
    the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
-   it = sigmoid(Wi X xt + Ri X ht-1 + bWi + bRi)
-   ft = sigmoid(Wf X xt + Rf X ht-1 + bWf + bRf)
-   ot = sigmoid(Wo X xt + Ro X ht-1 + bWo + bRo)
-   c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc)
-   ct = ft * ct-1 + it * c't
-   ht = ot * tanh(ct)
+   $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
+   $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
+   $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
+   $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
+   $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
+   $$ h_t = o_t \\odot tanh(c_t) $$
+   - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+     of weights from the input gate to the input)
+   - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
+   - sigmoid is the logistic sigmoid function.
+   - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+     and cell activation vectors, respectively, all of which have the same size as
+     the cell output activation vector $h$.
+   - The $\odot$ is the element-wise product of the vectors.
+   - `tanh` is the activation functions.
+   - $\tilde{c_t}$ is also called candidate hidden state,
+     which is computed based on the current input and the previous hidden state.
    Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
    X represensts a matrix multiplication
-   and tanh is the hyperbolic tangent function.
+   it, ft, ot, c't represent the input, forget, output and new gates respectively.
    Args:
@@ -510,7 +524,6 @@ def cudnn_lstm(input,
        init_c(Variable): The initial cell state of the LSTM.
                       This is a tensor with shape ( num_layers x batch_size x hidden_size )
                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-       batch_size (int): total distance numer of the batch
        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
        dropout_prob(float): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
                             There is NO dropout work on rnn output of the last RNN layers
@@ -524,9 +537,7 @@ def cudnn_lstm(input,
                    will be named automatically.
        default_initializer(Initialize|None): Where use initializer to initialize the Weight
                                              If set None, defaule initializer will be used
-       fix_seed(bool): If it's True, fix seed will used for dropout in LSTM
-       seed(int): If fix_seed is True, dropout seed in LSTM will use this seed
+       seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed

    Returns:
@@ -553,7 +564,7 @@ def cudnn_lstm(input,
        init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
        init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)

-       rnn_out, last_h, last_c = layers.cudnn_lstm( input, init_h, init_c, batch_size, \
+       rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \
                max_len, dropout_prob, input_size, hidden_size, \
                num_layers)
    """
@@ -610,12 +621,10 @@ def cudnn_lstm(input,
            'max_len': max_len,
            'is_bidirec': is_bidirec,
            'input_size': input_size,
-           'batch_size': batch_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'is_test': is_test,
            'dropout_prob': dropout_prob,
-           'fix_seed': fix_seed,
            'seed': seed,
        })
    return out, last_h, last_c
...
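For reference, a minimal usage sketch of the renamed layer under the new signature shown above. It assumes `input` is a Variable shaped (max_len x batch_size x input_size) as described in the Args section, and that `max_len`, `batch_size`, `input_size`, `hidden_size`, `num_layers` and `dropout_prob` are already defined; passing any seed other than the default -1 makes the in-layer dropout reproducible:

    init_h = layers.fill_constant([num_layers, batch_size, hidden_size], 'float32', 0.0)
    init_c = layers.fill_constant([num_layers, batch_size, hidden_size], 'float32', 0.0)
    # seed=-1 (the default) draws a random dropout seed; a fixed value, e.g. 123, pins it
    rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, max_len,
                                          dropout_prob, input_size, hidden_size,
                                          num_layers, is_bidirec=False, seed=123)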