Commit d1a17cad authored by phlrain

fix cudnn rnn; test=develop

Parent 487ee36a
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -122,13 +122,11 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
"The will affect the shape of the Out, last_h, and last_c")
.SetDefault(false);
AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
AddAttr<int>("batch_size", "the instance number the batch").SetDefault(10);
AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
AddAttr<int>("num_layers", "the total layer number of the LSTM")
.SetDefault(1);
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddAttr<bool>("fix_seed", "True if it fix dropout seed").SetDefault(false);
AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(-1);
AddComment(R"DOC(
CUDNN LSTM implementation
@@ -136,16 +134,32 @@ A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
it = σ(Wi X xt + Ri X ht-1 + bWi + bRi)
ft = σ(Wf X xt + Rf X ht-1 + bWf + bRf)
ot = σ(Wo X xt + Ro X ht-1 + bWo + bRo)
c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc)
ct = ft * ct-1 + it * c't
ht = ot * tanh(ct)
$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
Where σ is the sigmoid operator: σ(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
$$ h_t = o_t \\odot tanh(c_t) $$
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vectors).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors.
- `tanh` is the activation function.
- $\tilde{c_t}$ is also called the candidate hidden state,
which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
X represents a matrix multiplication
and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively.
)DOC");
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -273,7 +273,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
size_t max_len = ctx.Attr<int>("max_len");
float dropout_prob = ctx.Attr<float>("dropout_prob");
bool is_bidirec = ctx.Attr<bool>("is_bidirec");
int batch_size = ctx.Attr<int>("batch_size");
int input_size = ctx.Attr<int>("input_size");
int hidden_size = ctx.Attr<int>("hidden_size");
int num_layers = ctx.Attr<int>("num_layers");
@@ -304,9 +303,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
->GetMutable<CudnnRNNCache>();
std::random_device rnd;
int seed = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : rnd();
int seed = ctx.Attr<int>("seed");
if (seed == -1) {
seed = rnd();
}
auto input_w_numel = w->numel();
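// batch size is now inferred from the input tensor's second dimension
// instead of being read from the removed batch_size attribute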
auto batch_size = x->dims()[1];
cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size,
hidden_size, num_layers, dropout_prob, is_bidirec,
seed, input_w_numel);
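In other words, the commit replaces the fix_seed bool with a single seed attribute whose sentinel value -1 means "draw a random seed". A hedged Python sketch of the equivalent selection logic, under that reading of the C++ above:

import random

def resolve_dropout_seed(seed_attr):
    # seed_attr == -1 (the new default) requests a random seed, mirroring
    # std::random_device above; any other value fixes the dropout seed.
    if seed_attr == -1:
        return random.SystemRandom().randint(0, 2**31 - 1)
    return seed_attr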
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -29,7 +29,10 @@ using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class CudnnLSTMKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {}
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW(
"CPU is not support for this kernel now. Will be add in the future");
}
};
template <typename DeviceContext, typename T>
......
@@ -169,7 +169,7 @@ __all__ = [
'log_loss',
'add_position_encoding',
'bilinear_tensor_product',
'cudnn_lstm',
'lstm',
]
@@ -467,39 +467,53 @@ def dynamic_lstm(input,
return hidden, cell
def cudnn_lstm(input,
init_h,
init_c,
batch_size,
max_len,
dropout_prob,
input_size,
hidden_size,
num_layers,
is_bidirec=False,
dtype='float32',
is_test=False,
name=None,
default_initializer=None,
fix_seed=False,
seed=0):
def lstm(input,
init_h,
init_c,
max_len,
dropout_prob,
input_size,
hidden_size,
num_layers,
is_bidirec=False,
dtype='float32',
is_test=False,
name=None,
default_initializer=None,
seed=-1):
"""
CUDNN LSTM implementation
If the device is a GPU, this op will use the cuDNN LSTM implementation
A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
it = sigmoid(Wi X xt + Ri X ht-1 + bWi + bRi)
ft = sigmoid(Wf X xt + Rf X ht-1 + bWf + bRf)
ot = sigmoid(Wo X xt + Ro X ht-1 + bWo + bRo)
c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc)
ct = ft * ct-1 + it * c't
ht = ot * tanh(ct)
$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
$$ h_t = o_t \\odot tanh(c_t) $$
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vectors).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors.
- `tanh` is the activation function.
- $\tilde{c_t}$ is also called the candidate hidden state,
which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
X represents a matrix multiplication
and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively.
Args:
@@ -510,7 +524,6 @@ def cudnn_lstm(input,
init_c(Variable): The initial cell state of the LSTM.
This is a tensor with shape ( num_layers x batch_size x hidden_size )
if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
batch_size (int): total instance number of the batch
max_len (int): max length of LSTM. The first dim of the input tensor CAN NOT be greater than max_len
dropout_prob(float): dropout prob; dropout ONLY works between rnn layers, NOT between time steps.
There is NO dropout on the rnn output of the last RNN layer
@@ -524,9 +537,7 @@ def cudnn_lstm(input,
will be named automatically.
default_initializer(Initializer|None): the initializer used to initialize the weight.
If set to None, the default initializer will be used
fix_seed(bool): If True, a fixed seed will be used for dropout in LSTM
seed(int): If fix_seed is True, dropout in LSTM will use this seed
seed(int): seed for dropout in LSTM; if it is -1, dropout will use a random seed
Returns:
@@ -553,7 +564,7 @@ def cudnn_lstm(input,
init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
rnn_out, last_h, last_c = layers.cudnn_lstm( input, init_h, init_c, batch_size, \
rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \
max_len, dropout_prob, input_size, hidden_size, \
num_layers)
"""
@@ -610,12 +621,10 @@ def cudnn_lstm(input,
'max_len': max_len,
'is_bidirec': is_bidirec,
'input_size': input_size,
'batch_size': batch_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'is_test': is_test,
'dropout_prob': dropout_prob,
'fix_seed': fix_seed,
'seed': seed,
})
return out, last_h, last_c
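For completeness, a hedged usage sketch of the renamed layers.lstm API under the new signature, expanding the abbreviated docstring example above. The tensor names and sizes are illustrative only; note that batch_size and fix_seed are no longer passed.

import paddle.fluid as fluid
import paddle.fluid.layers as layers

emb_dim, hidden_size, num_layers, max_len, batch_size = 256, 512, 1, 100, 20
data = fluid.layers.data(name='x', shape=[max_len, emb_dim], dtype='float32')
init_h = layers.fill_constant([num_layers, batch_size, hidden_size], 'float32', 0.0)
init_c = layers.fill_constant([num_layers, batch_size, hidden_size], 'float32', 0.0)

# batch_size is now inferred from the input; seed=-1 keeps dropout random.
rnn_out, last_h, last_c = layers.lstm(
    data, init_h, init_c, max_len, 0.2, emb_dim, hidden_size,
    num_layers=num_layers, seed=-1)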
......