diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index 3b90b64b4effacf7240fb1bee8c0aa44251ad727..afb095a04e73c2f09b828c01630ef2347ff49613 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -117,7 +117,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("C0", "(Tensor, optional) the initial cell state is an optional " "input. This is a tensor with shape (N x D), where N is the " - "batch size. `H0` and `C0` can be NULL but only at the same time") + "batch size. `H0` and `C0` can be NULL but only at the same time.") .AsDispensable(); AddInput("Weight", "(Tensor) the learnable hidden-hidden weights." diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 4e8fd407c9983e2827d3137fa4f49a8425d5dce2..7759ce6af6060fbaa5724763184f39198e6eedac 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -227,6 +227,96 @@ def dynamic_lstm(input, cell_activation='tanh', candidate_activation='tanh', dtype='float32'): + """ + **Dynamic LSTM Layer** + + The default implementation is diagonal/peephole connection + (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: + + .. math:: + + i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\ + + f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\ + + \tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\ + + o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\ + + c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ + + h_t = o_t \odot act_h(c_t) + + where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix + of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ + are diagonal weight matrices for peephole connections. In our implementation, + we use vectors to represent these diagonal weight matrices.
The b terms + denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ + is the non-linear activations, such as logistic sigmoid function, and + $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. + + The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ + are the cell input and cell output activation functions and `tanh` is usually + used for them. $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. + + Set `use_peepholes` False to disable peephole connection. The formula + is omitted here, please refer to the paper + http://www.bioinf.jku.at/publications/older/2604.pdf for details. + + Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ + operations on the input $x_{t}$ are NOT included in this operator. + Users can choose to use fully-connected operator before LSTM operator. + + Args: + input(Variable): The input of dynamic_lstm layer, which supports + variable-time length input sequence. The underlying tensor in + this Variable is a matrix with shape (T X 4D), where T is the + total time steps in this mini-batch, D is the hidden size. + size(int): The size of input. + param_attr(ParamAttr): The parameter attribute for the learnable + hidden-hidden weights. + - The shape is (D x 4D), where D is the hidden size. + - param_attr = {W_ch, W_ih, W_fh, W_oh} + bias_attr(ParamAttr): The bias attribute for the learnable bias + weights, which contains two parts: input-hidden bias weight + and peephole connections weight if setting `use_peepholes` to True. + 1.
`use_peepholes = False` + - The shape is (1 x 4D). + - Bias = {b_c, b_i, b_f, b_o}. + 2. `use_peepholes = True` + - The shape is (1 x 7D). + - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}. + use_peepholes(bool, default: True): whether to enable diagonal/peephole + connections. + is_reverse(bool, default: False): whether to compute reversed LSTM. + gate_activation(string, choices: "sigmoid", "tanh", "relu", "identity", + default: "sigmoid"): The activation for input gate, forget gate and + output gate. + cell_activation(string, choices: "sigmoid", "tanh", "relu", "identity", + default: "tanh"): The activation for cell output. + candidate_activation(string, choices: "sigmoid", "tanh", "relu", + "identity", default: "tanh"): The activation for candidate hidden + state. + dtype(string, default: 'float32'): Data type of the learnable parameters. + + Returns: + hidden(Variable): the hidden state of LSTM layer. The shape is (T x D), + and lod is the same with the `input`. + cell(Variable): the cell state of LSTM layer. The shape is (T x D), and + lod is the same with the `input`. + + Example: + .. code-block:: python + + hidden_dim = 512 + forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, + act='tanh', bias_attr=True) + forward, _ = fluid.layers.dynamic_lstm( + input=forward_proj, size=hidden_dim * 4, use_peepholes=False) + """ helper = LayerHelper('lstm', **locals()) size = size / 4 weight = helper.create_parameter(