From ba78a0c7c27474ac05b1633a29ca536f85212d46 Mon Sep 17 00:00:00 2001 From: Youwei Song Date: Thu, 10 Oct 2019 19:14:23 +0800 Subject: [PATCH] polish 3 LSTM en doc (#20383) * polish en doc, test=document_fix, test=develop * update sample code, test=document_fix, test=develop * fix sample typo, test=document_fix, test=develop --- paddle/fluid/API.spec | 6 +- python/paddle/fluid/layers/nn.py | 438 +++++++++++++++---------------- 2 files changed, 217 insertions(+), 227 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 764a302d1d7..dbcc514a544 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -130,8 +130,8 @@ paddle.fluid.one_hot (ArgSpec(args=['input', 'depth', 'allow_out_of_range'], var paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, None)), ('document', 'e28421f1253a3545d9bfe81a8028ea68')) paddle.fluid.layers.center_loss (ArgSpec(args=['input', 'label', 'num_classes', 'alpha', 'param_attr', 'update_center'], varargs=None, keywords=None, defaults=(True,)), ('document', '18112442f55b5862bbec8feee841c905')) paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'c51fcac7a4f5786ca41f27fa60bd22c5')) -paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', '6d3ee14da70adfa36d85c40b18716ef2')) -paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'c37d51aad655c8a9f9b045c64717320a')) +paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'd4a82e2f5feb20c4a23ced8054e047ed')) +paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b35fe3e0c2ecca15a8be658277e064ec')) paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3')) paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 
'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e')) paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b28bdb43160e9667be2a3457d19d9f5b')) @@ -290,7 +290,7 @@ paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '45fc3652a8e1aeffbe4eba371c54f756')) paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2b0e5d5c155ce24bafc38b78cd0b164')) paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c568321feb4d16c41a83df43f95089d')) -paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', 'baa7327ed89df6b7bdd32f9ffdb62f63')) +paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', '5193cf1113f9d8d8f682ee5a5fc8b391')) paddle.fluid.layers.shuffle_channel (ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '276a1213dd431228cefa33c3146df34a')) paddle.fluid.layers.temporal_shift (ArgSpec(args=['x', 'seg_num', 'shift_ratio', 'name'], varargs=None, keywords=None, defaults=(0.25, None)), ('document', 'd5945431cdcae3cda21914db5bbf383e')) paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)), ('document', '8404e472ac12b4a30a505d3d3a3e5fdb')) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fa761e964e1..2b0759518dc 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -667,7 +667,6 @@ def _pull_box_sparse(input, size, dtype='float32'): return outs -@templatedoc(op_type="lstm") def dynamic_lstm(input, size, h_0=None, @@ -682,58 +681,82 @@ def dynamic_lstm(input, dtype='float32', name=None): """ - ${comment} + **Note**: + 1. This OP only supports LoDTensor as inputs. If you need to deal with Tensor, please use :ref:`api_fluid_layers_lstm` . + 2. In order to improve efficiency, users must first map the input of dimension [T, hidden_size] to input of [T, 4 * hidden_size], and then pass it to this OP. - Args: - input (Variable): ${input_comment} - size (int): 4 * hidden size. - h_0(Variable): The initial hidden state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size and D is the hidden size. - c_0(Variable): The initial cell state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size. `h_0` and `c_0` can be NULL but only at the same time. 
- param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weights. + The implementation of this OP includes diagonal/peephole connections. + Please refer to `Gers, F. A., & Schmidhuber, J. (2000) `_ . + If you do not need peephole connections, please set use_peepholes to False. - - Weights = {:math:`W_{ch}, W_{ih}, \ - W_{fh}, W_{oh}`} - - The shape is (D x 4D), where D is the hidden - size. + This OP computes each timestep as follows: - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the - parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The bias attribute for the learnable bias + .. math:: + i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_{x_i} + b_{h_i}) + .. math:: + f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_{x_f} + b_{h_f}) + .. math:: + o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_{x_o} + b_{h_o}) + .. math:: + \widetilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_{x_c} + b_{h_c}) + .. math:: + c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t} + .. math:: + h_t = o_t \odot tanh(c_t) + + The symbolic meanings in the formula are as follows: + + - :math:`x_{t}` represents the input at timestep :math:`t` + - :math:`h_{t}` represents the hidden state at timestep :math:`t` + - :math:`h_{t-1}, c_{t-1}` represent the hidden state and cell state at timestep :math:`t-1` , respectively + - :math:`\widetilde{c_t}` represents the candidate cell state + - :math:`i_t` , :math:`f_t` and :math:`o_t` represent the input gate, forget gate and output gate, respectively + - :math:`W` represents a weight (e.g., :math:`W_{ix}` is the weight of the linear transformation of the input :math:`x_{t}` when calculating the input gate :math:`i_t` ) + - :math:`b` represents a bias (e.g., :math:`b_{i}` is the bias of the input gate) + - :math:`\sigma` represents the nonlinear activation function for the gates, sigmoid by default + - :math:`\odot` represents the Hadamard (element-wise) product of two matrices, i.e. multiplying the elements at the same positions of two matrices with the same dimensions to get another matrix with the same dimensions + + Parameters: + input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, a LoDTensor of shape :math:`[T, 4*hidden\_size]` . Data type is float32 or float64. + size (int): must be 4 * hidden_size. + h_0( :ref:`api_guide_Variable_en` , optional): The initial hidden state of the LSTM, a Tensor of shape :math:`[batch\_size, hidden\_size]` . + Data type is float32 or float64. If set to None, it will be a tensor of all zeros. Default: None. + c_0( :ref:`api_guide_Variable_en` , optional): The initial cell state of the LSTM, a Tensor of shape :math:`[batch\_size, hidden\_size]` . + Data type is float32 or float64. If set to None, it will be a tensor of all zeros. `h_0` and `c_0` can be None but only at the same time. Default: None. + param_attr(ParamAttr, optional): Parameter attribute of the weight. If it is None, the default weight parameter attribute is used. Please refer to :ref:`api_fluid_ParamAttr` . + If the user needs to set this parameter, the dimension must be :math:`[hidden\_size, 4*hidden\_size]` . Default: None. + + - Weights = :math:`\{ W_{ch},W_{ih},W_{fh},W_{oh} \}` , the shape is [hidden_size, 4*hidden_size].
+ + bias_attr (ParamAttr, optional): The bias attribute for the learnable bias weights, which contains two parts: input-hidden bias weights and, if `use_peepholes` is set to `True`, peephole connection weights. + Please refer to :ref:`api_fluid_ParamAttr` . Default: None. 1. `use_peepholes = False` - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). + - The shape is [1, 4*hidden_size]. 2. `use_peepholes = True` - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ W_{fc}, W_{oc}`}. - - The shape is (1 x 7D). - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as bias_attr. - If the Initializer of the bias_attr is not set, - the bias is initialized zero. Default: None. - use_peepholes (bool): ${use_peepholes_comment} - is_reverse (bool): ${is_reverse_comment} - gate_activation (str): ${gate_activation_comment} - cell_activation (str): ${cell_activation_comment} - candidate_activation (str): ${candidate_activation_comment} - dtype (str): Data type. Choices = ["float32", "float64"], default "float32". - name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + - The shape is [1, 7*hidden_size]. + + use_peepholes (bool, optional): Whether to use peephole connections or not. Default: True. + is_reverse (bool, optional): Whether to calculate the reverse LSTM. Default: False. + gate_activation (str, optional): The activation for the input gate, forget gate and output gate. Default: "sigmoid". + cell_activation (str, optional): The activation for the cell output. Default: "tanh". + candidate_activation (str, optional): The activation for the candidate hidden state. Default: "tanh". + dtype (str, optional): Data type, can be "float32" or "float64". Default: "float32". + name (str, optional): A name for this layer. Please refer to :ref:`api_guide_Name` . Default: None. Returns: - tuple: The hidden state, and cell state of LSTM. The shape of both \ - is (T x D), and lod is the same with the `input`. + tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ) : + + The hidden state and cell state of the LSTM + + - hidden: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype are the same as the input's. + - cell: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype are the same as the input's. Examples: .. code-block:: python @@ -743,15 +766,16 @@ def dynamic_lstm(input, vocab_size = 10000 hidden_dim = 512 - data = fluid.layers.data(name='x', shape=[1], - dtype='int32', lod_level=1) - emb = fluid.layers.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True) + data = fluid.data(name='x', shape=[None], dtype='int64', lod_level=1) + emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True) forward_proj = fluid.layers.fc(input=emb, size=hidden_dim * 4, bias_attr=False) - forward, _ = fluid.layers.dynamic_lstm( + forward, cell = fluid.layers.dynamic_lstm( input=forward_proj, size=hidden_dim * 4, use_peepholes=False) + forward.shape # (-1, 512) + cell.shape # (-1, 512) """ assert in_dygraph_mode( ) is not True, "please use lstm instead of dynamic_lstm in dygraph mode!"
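The gate equations above can be checked against a small reference implementation. Below is a minimal NumPy sketch of a single timestep following the documented formulas (peephole terms omitted, matching the equations above); the gate order inside the concatenated weights is an illustrative assumption, not the op's internal layout.

.. code-block:: python

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(x_t, h_prev, c_prev, W_x, W_h, b_x, b_h):
        # W_x: [input_size, 4*hidden_size], W_h: [hidden_size, 4*hidden_size],
        # b_x, b_h: [4*hidden_size]; the gate order (i, f, o, c) is assumed.
        gates = x_t @ W_x + h_prev @ W_h + b_x + b_h
        i, f, o, c_hat = np.split(gates, 4)
        i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)
        c_hat = np.tanh(c_hat)           # candidate cell state
        c_t = f * c_prev + i * c_hat     # c_t = f (.) c_{t-1} + i (.) c~_t
        h_t = o * np.tanh(c_t)           # h_t = o (.) tanh(c_t)
        return h_t, c_t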
@@ -813,77 +837,76 @@ def lstm(input, default_initializer=None, seed=-1): """ - If Device is GPU, This op will use cudnn LSTM implementation - A four-gate Long Short-Term Memory network with no peephole connections. - In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, - the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: - - .. math:: - - i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) - - f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) - - o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) + **Note**: + This OP only supports running on GPU devices. - \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) + This OP implements the LSTM operation - `Hochreiter, S., & Schmidhuber, J. (1997) `_ . - c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} + The implementation of this OP does not include diagonal/peephole connections. + Please refer to `Gers, F. A., & Schmidhuber, J. (2000) `_ . + If you need peephole connections, please use :ref:`api_fluid_layers_dynamic_lstm` . - h_t &= o_t \odot tanh(c_t) + This OP computes each timestep as follows: - - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix - of weights from the input gate to the input) - - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). - - sigmoid is the logistic sigmoid function. - - $i, f, o$ and $c$ are the input gate, forget gate, output gate, - and cell activation vectors, respectively, all of which have the same size as - the cell output activation vector $h$. - - The :math:`\odot` is the element-wise product of the vectors. - - :math:`tanh` is the activation functions. - - :math:`\\tilde{c_t}` is also called candidate hidden state, - which is computed based on the current input and the previous hidden state. + .. math:: + i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_{x_i} + b_{h_i}) + .. math:: + f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_{x_f} + b_{h_f}) + .. math:: + o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_{x_o} + b_{h_o}) + .. math:: + \widetilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_{x_c} + b_{h_c}) + .. math:: + c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t} + .. math:: + h_t = o_t \odot tanh(c_t) - Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication, - X represensts a matrix multiplication + The symbolic meanings in the formula are as follows: + - :math:`x_{t}` represents the input at timestep :math:`t` + - :math:`h_{t}` represents the hidden state at timestep :math:`t` + - :math:`h_{t-1}, c_{t-1}` represent the hidden state and cell state at timestep :math:`t-1` , respectively + - :math:`\widetilde{c_t}` represents the candidate cell state + - :math:`i_t` , :math:`f_t` and :math:`o_t` represent the input gate, forget gate and output gate, respectively + - :math:`W` represents a weight (e.g., :math:`W_{ix}` is the weight of the linear transformation of the input :math:`x_{t}` when calculating the input gate :math:`i_t` ) + - :math:`b` represents a bias (e.g., :math:`b_{i}` is the bias of the input gate) + - :math:`\sigma` represents the nonlinear activation function for the gates, sigmoid by default + - :math:`\odot` represents the Hadamard (element-wise) product of two matrices, i.e.
multiplying the elements at the same positions of two matrices with the same dimensions to get another matrix with the same dimensions - Args: - input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size ) - init_h(Variable): The initial hidden state of the LSTM - This is a tensor with shape ( num_layers x batch_size x hidden_size) - if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) - init_c(Variable): The initial cell state of the LSTM. - This is a tensor with shape ( num_layers x batch_size x hidden_size ) - if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) - max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len - hidden_size (int): hidden size of the LSTM - num_layers (int): total layers number of the LSTM - dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps - There is NO dropout work on rnn output of the last RNN layers - is_bidirec (bool): If it is bidirectional - is_test (bool): If it is in test phrase - name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - default_initializer(Initialize|None): Where use initializer to initialize the Weight - If set None, defaule initializer will be used - seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed + Parameters: + input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, 3-D Tensor of shape :math:`[batch\_size, seq\_len, input\_dim]` . Data type is float32 or float64. + init_h( :ref:`api_guide_Variable_en` ): The initial hidden state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` . + If is_bidirec = True, the shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64. + init_c( :ref:`api_guide_Variable_en` ): The initial cell state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` . + If is_bidirec = True, the shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64. + max_len (int): max length of the LSTM. The first dim of the input tensor cannot be greater than max_len. + hidden_size (int): hidden size of the LSTM. + num_layers (int): total number of layers of the LSTM. + dropout_prob(float, optional): dropout probability. Dropout only works between RNN layers, not between time steps, and no dropout is applied to the output of the last RNN layer. + Default: 0.0. + is_bidirec (bool, optional): Whether the LSTM is bidirectional. Default: False. + is_test (bool, optional): Whether it is in test phase. Default: False. + name (str, optional): A name for this layer. If set to None, the layer + will be named automatically. Default: None. + default_initializer(Initializer, optional): The initializer used to initialize the weight. + If set to None, the default initializer will be used. Default: None. + seed(int, optional): Seed for dropout in the LSTM. If it is -1, dropout will use a random seed. Default: -1.
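Because the required leading dimension of init_h and init_c depends on is_bidirec, the rule above can be made concrete with a small shape helper. This is a hedged sketch: fluid.layers.fill_constant is used only as one convenient way to build all-zero initial states, and the sample sizes are arbitrary.

.. code-block:: python

    import paddle.fluid as fluid

    num_layers, batch_size, hidden_size = 1, 20, 150
    is_bidirec = False

    # The leading dimension of the initial states doubles when is_bidirec=True.
    num_directions = 2 if is_bidirec else 1
    state_shape = [num_layers * num_directions, batch_size, hidden_size]

    init_h = fluid.layers.fill_constant(state_shape, 'float32', 0.0)
    init_c = fluid.layers.fill_constant(state_shape, 'float32', 0.0)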
Returns: - rnn_out(Tensor),last_h(Tensor),last_c(Tensor): + tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ) : Three tensors, rnn_out, last_h, last_c: - - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \ - if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) + - rnn_out is the result of LSTM hidden states, shape is :math:`[seq\_len, batch\_size, hidden\_size]` \ + if is_bidirec is set to True, the shape will be :math:`[seq\_len, batch\_size, hidden\_size*2]` - last_h is the hidden state of the last step of LSTM \ - shape is ( num_layers x batch_size x hidden_size ) \ - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + shape is :math:`[num\_layers, batch\_size, hidden\_size]` \ + if is_bidirec is set to True, the shape will be :math:`[num\_layers*2, batch\_size, hidden\_size]` - last_c(Tensor): the cell state of the last step of LSTM \ - shape is ( num_layers x batch_size x hidden_size ) \ - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + shape is :math:`[num\_layers, batch\_size, hidden\_size]` \ + if is_bidirec is set to True, the shape will be :math:`[num\_layers*2, batch\_size, hidden\_size]` Examples: .. code-block:: python @@ -894,9 +917,8 @@ def lstm(input, emb_dim = 256 vocab_size = 10000 - data = fluid.layers.data(name='x', shape=[-1, 100, 1], - dtype='int32') - emb = fluid.layers.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True) + data = fluid.data(name='x', shape=[None, 100], dtype='int64') + emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True) batch_size = 20 max_len = 100 dropout_prob = 0.2 @@ -908,6 +930,9 @@ def lstm(input, rnn_out, last_h, last_c = layers.lstm( emb, init_h, init_c, \ max_len, hidden_size, num_layers, \ dropout_prob=dropout_prob) + rnn_out.shape # (-1, 100, 150) + last_h.shape # (1, 20, 150) + last_c.shape # (1, 20, 150) """ helper = LayerHelper('cudnn_lstm', **locals())
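The dropout_prob semantics documented above (applied between stacked layers only, never across time steps, and never after the last layer) can be illustrated with a short NumPy sketch. The loop structure below is an illustrative assumption about how a stacked RNN applies inter-layer dropout, not Paddle's cuDNN implementation.

.. code-block:: python

    import numpy as np

    def stacked_rnn_forward(x, layer_fns, dropout_prob, rng):
        # x: [seq_len, batch, dim]; layer_fns: one callable per RNN layer
        # that maps a full input sequence to a full hidden-state sequence.
        out = x
        for idx, layer_fn in enumerate(layer_fns):
            out = layer_fn(out)  # the recurrence over time happens inside
            # Inverted dropout between stacked layers only: never across
            # time steps, and never after the last layer.
            if dropout_prob > 0.0 and idx < len(layer_fns) - 1:
                keep = 1.0 - dropout_prob
                mask = rng.binomial(1, keep, size=out.shape)
                out = out * mask / keep
        return out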
@@ -992,138 +1017,102 @@ def dynamic_lstmp(input, cell_clip=None, proj_clip=None): """ - **Dynamic LSTMP Layer** - - LSTMP (LSTM with recurrent projection) layer has a separate projection - layer after the LSTM layer, projecting the original hidden state to a - lower-dimensional one, which is proposed to reduce the number of total - parameters and furthermore computational complexity for the LSTM, - espeacially for the case that the size of output units is relative - large (https://research.google.com/pubs/archive/43905.pdf). - - The formula is as follows: - - .. math:: - - i_t & = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) - - f_t & = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) - - \\tilde{c_t} & = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) - - o_t & = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) + **Note**: + In order to improve efficiency, users must first map the input of dimension [T, hidden_size] to input of [T, 4 * hidden_size], and then pass it to this OP. - c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} + This OP implements the LSTMP (LSTM Projected) layer. + The LSTMP layer has a separate linear mapping layer behind the LSTM layer. -- `Sak, H., Senior, A., & Beaufays, F. (2014) `_ . - h_t & = o_t \odot act_h(c_t) + Compared with the standard LSTM layer, LSTMP has an additional linear mapping layer, + which is used to map from the original hidden state :math:`h_t` to the lower dimensional state :math:`r_t` . + This reduces the total number of parameters and computational complexity, especially when the output size is relatively large. - r_t & = \overline{act_h}(W_{rh}h_t) + The default implementation of this OP contains diagonal/peephole connections, + please refer to `Gers, F. A., & Schmidhuber, J. (2000) `_ . + If you need to disable the peephole connections, set use_peepholes to False. - In the above formula: + This OP computes each timestep as follows: - * :math:`W`: Denotes weight matrices (e.g. :math:`W_{xi}` is \ - the matrix of weights from the input gate to the input). - * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \ - matrices for peephole connections. In our implementation, \ - we use vectors to represent these diagonal weight matrices. - * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \ - bias vector). - * :math:`\sigma`: The activation, such as logistic sigmoid function. - * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \ - gate, and cell activation vectors, respectively, all of which have \ - the same size as the cell output activation vector :math:`h`. - * :math:`h`: The hidden state. - * :math:`r`: The recurrent projection of the hidden state. - * :math:`\\tilde{c_t}`: The candidate hidden state, whose \ - computation is based on the current input and previous hidden state. - * :math:`\odot`: The element-wise product of the vectors. - * :math:`act_g` and :math:`act_h`: The cell input and cell output \ - activation functions and `tanh` is usually used for them. - * :math:`\overline{act_h}`: The activation function for the projection \ - output, usually using `identity` or same as :math:`act_h`. + .. math:: + i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) + .. math:: + f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) + .. math:: + \widetilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) + .. math:: + c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t} + .. math:: + o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_{t} + b_o) + .. math:: + h_t = o_t \odot act_h(c_t) + .. math:: + r_t = \overline{act_h}(W_{rh}h_t) - Set `use_peepholes` to `False` to disable peephole connection. The formula - is omitted here, please refer to the paper - http://www.bioinf.jku.at/publications/older/2604.pdf for details. + The symbolic meanings in the formula are as follows: - Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}` - operations on the input :math:`x_{t}` are NOT included in this operator. - Users can choose to use fully-connected layer before LSTMP layer. + - :math:`x_{t}` represents the input at timestep :math:`t` + - :math:`h_{t}` represents the hidden state at timestep :math:`t` + - :math:`r_{t}` represents the projected output of the hidden state :math:`h_{t}` + - :math:`h_{t-1}, c_{t-1}, r_{t-1}` represent the hidden state, cell state and projected output at timestep :math:`t-1` , respectively + - :math:`\widetilde{c_t}` represents the candidate cell state + - :math:`i_t` , :math:`f_t` and :math:`o_t` represent the input gate, forget gate and output gate, respectively + - :math:`W` represents a weight (e.g., :math:`W_{ix}` is the weight of the linear transformation of the input :math:`x_{t}` when calculating the input gate :math:`i_t` ) + - :math:`b` represents a bias (e.g., :math:`b_{i}` is the bias of the input gate) + - :math:`\sigma` represents the nonlinear activation function for the gates, sigmoid by default + - :math:`\odot` represents the Hadamard (element-wise) product of two matrices, i.e.
multiplying the elements at the same positions of two matrices with the same dimensions to get another matrix with the same dimensions - Args: - input(Variable): The input of dynamic_lstmp layer, which supports - variable-time length input sequence. The underlying - tensor in this Variable is a matrix with shape - (T X 4D), where T is the total time steps in this - mini-batch, D is the hidden size. - size(int): 4 * hidden size. + Parameters: + input( :ref:`api_guide_Variable_en` ): The input of the dynamic_lstmp layer, which supports + variable-length input sequences. + It is a LoDTensor of shape :math:`[T, 4*hidden\_size]` . Data type is float32 or float64. + size(int): must be 4 * hidden_size. proj_size(int): The size of the projection output. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weight and projection weight. + param_attr(ParamAttr, optional): Parameter attribute of the weight. If it is None, the default weight parameter attribute is used. Please refer to :ref:`api_fluid_ParamAttr` . + If the user needs to set this parameter, the weight dimensions are as follows. Default: None. - - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \ - W_{fh}, W_{oh}`}. - - The shape of hidden-hidden weight is (P x 4D), - where P is the projection size and D the hidden - size. - - Projection weight = {:math:`W_{rh}`}. - - The shape of projection weight is (D x P). + - Weights = :math:`\{ W_{cr},W_{ir},W_{fr},W_{or} \}` , the shape is [P, 4*hidden_size] , where P is the projection size. + - Projection weight = :math:`\{ W_{rh} \}` , the shape is [hidden_size, P]. - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the - parameter is initialized with Xavier. Default: None. - bias_attr(ParamAttr|None): The bias attribute for the learnable bias + bias_attr (ParamAttr, optional): The bias attribute for the learnable bias weights, which contains two parts: input-hidden bias weights and, if `use_peepholes` is set to `True`, peephole connection weights. + Please refer to :ref:`api_fluid_ParamAttr` . Default: None. 1. `use_peepholes = False` - - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). + - Biases = {:math:`b_c, b_i, b_f, b_o`}. + - The shape is [1, 4*hidden_size]. 2. `use_peepholes = True` - - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ W_{fc}, W_{oc}`}. + - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ W_{fc}, W_{oc}`}. - - The shape is (1 x 7D). - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as bias_attr. - If the Initializer of the bias_attr is not set, - the bias is initialized zero. Default: None. - use_peepholes(bool): Whether to enable diagonal/peephole connections, - default `True`. - is_reverse(bool): Whether to compute reversed LSTM, default `False`. - gate_activation(str): The activation for input gate, forget gate and - output gate. Choices = ["sigmoid", "tanh", "relu", - "identity"], default "sigmoid". - cell_activation(str): The activation for cell output. Choices = ["sigmoid", - "tanh", "relu", "identity"], default "tanh". - candidate_activation(str): The activation for candidate hidden state. - Choices = ["sigmoid", "tanh", "relu", "identity"], - default "tanh". - proj_activation(str): The activation for projection output. - Choices = ["sigmoid", "tanh", "relu", "identity"], - default "tanh". - dtype(str): Data type.
Choices = ["float32", "float64"], default "float32". - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - h_0(Variable): The initial hidden state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size and D is the projection size. - c_0(Variable): The initial cell state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size. `h_0` and `c_0` can be NULL but only at the same time. - cell_clip(float): If provided the cell state is clipped - by this value prior to the cell output activation. - proj_clip(float): If `num_proj > 0` and `proj_clip` is + - The shape is [1, 7*hidden_size]. + + use_peepholes (bool, optional): Whether to use peephole connection or not. Default True. + is_reverse (bool, optional): Whether to calculate reverse LSTM. Default False. + gate_activation (str, optional): The activation for input gate, forget gate and output gate. Default "sigmoid". + cell_activation (str, optional): The activation for cell output. Default "tanh". + candidate_activation (str, optional): The activation for candidate hidden state. Default "tanh". + proj_activation(str, optional): The activation for projection output. Default "tanh". + dtype (str, optional): Data type, can be "float32" or "float64". Default "float32". + name (str, optional): A name for this layer. Please refer to :ref:`api_guide_Name` . Default: None. + h_0( :ref:`api_guide_Variable` , optional): The initial hidden state is an optional input, default is zero. + This is a tensor with shape :math:`[batch\_size, P]` , where P is the projection size. Default: None. + c_0( :ref:`api_guide_Variable` , optional): The initial cell state is an optional input, default is zero. + This is a tensor with shape :math:`[batch\_size, P]` , where P is the projection size. + `h_0` and `c_0` can be None but only at the same time. Default: None. + cell_clip(float, optional): If not None, the cell state is clipped + by this value prior to the cell output activation. Default: None. + proj_clip(float, optional): If `num_proj > 0` and `proj_clip` is provided, then the projected values are clipped elementwise to within - `[-proj_clip, proj_clip]`. + `[-proj_clip, proj_clip]`. Default: None. Returns: - tuple: A tuple of two output variable: the projection of hidden state, \ - and cell state of LSTMP. The shape of projection is (T x P), \ - for the cell state which is (T x D), and both LoD is the same \ - with the `input`. + tuple ( :ref:`api_guide_Variable` , :ref:`api_guide_Variable` ) : + + The hidden state and cell state of LSTMP + + - hidden: LoDTensor with shape of :math:`[T, P]` , and its lod and dtype is the same as the input. + - cell: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype is the same as the input. 
Examples: @@ -1131,19 +1120,20 @@ def dynamic_lstmp(input, import paddle.fluid as fluid dict_dim, emb_dim = 128, 64 - data = fluid.layers.data(name='sequence', shape=[1], - dtype='int32', lod_level=1) - emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + data = fluid.data(name='sequence', shape=[None], dtype='int64', lod_level=1) + emb = fluid.embedding(input=data, size=[dict_dim, emb_dim]) hidden_dim, proj_dim = 512, 256 fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4, - act=None, bias_attr=None) - proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out, - size=hidden_dim * 4, - proj_size=proj_dim, - use_peepholes=False, - is_reverse=True, - cell_activation="tanh", - proj_activation="tanh") + act=None, bias_attr=None) + proj_out, last_c = fluid.layers.dynamic_lstmp(input=fc_out, + size=hidden_dim * 4, + proj_size=proj_dim, + use_peepholes=False, + is_reverse=True, + cell_activation="tanh", + proj_activation="tanh") + proj_out.shape # (-1, 256) + last_c.shape # (-1, 512) """ assert in_dygraph_mode( -- GitLab