Commit 06be9543 authored by Youwei Song, committed by hong

[cherry-pick] #20383 (#20436)

test=release/1.6, test=document_fix
Parent b3fd414d
@@ -123,8 +123,8 @@ paddle.fluid.one_hot (ArgSpec(args=['input', 'depth', 'allow_out_of_range'], var
 paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, None)), ('document', 'e28421f1253a3545d9bfe81a8028ea68'))
 paddle.fluid.layers.center_loss (ArgSpec(args=['input', 'label', 'num_classes', 'alpha', 'param_attr', 'update_center'], varargs=None, keywords=None, defaults=(True,)), ('document', '18112442f55b5862bbec8feee841c905'))
 paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'c51fcac7a4f5786ca41f27fa60bd22c5'))
-paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', '6d3ee14da70adfa36d85c40b18716ef2'))
-paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'c37d51aad655c8a9f9b045c64717320a'))
+paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'd4a82e2f5feb20c4a23ced8054e047ed'))
+paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b35fe3e0c2ecca15a8be658277e064ec'))
 paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3'))
 paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e'))
 paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b28bdb43160e9667be2a3457d19d9f5b'))
@@ -283,7 +283,7 @@ paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta
 paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '45fc3652a8e1aeffbe4eba371c54f756'))
 paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2b0e5d5c155ce24bafc38b78cd0b164'))
 paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c568321feb4d16c41a83df43f95089d'))
-paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', 'baa7327ed89df6b7bdd32f9ffdb62f63'))
+paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', '5193cf1113f9d8d8f682ee5a5fc8b391'))
 paddle.fluid.layers.shuffle_channel (ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '276a1213dd431228cefa33c3146df34a'))
 paddle.fluid.layers.temporal_shift (ArgSpec(args=['x', 'seg_num', 'shift_ratio', 'name'], varargs=None, keywords=None, defaults=(0.25, None)), ('document', 'd5945431cdcae3cda21914db5bbf383e'))
 paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)), ('document', '8404e472ac12b4a30a505d3d3a3e5fdb'))
...
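Only the three 'document' fingerprints change in API.spec; the signatures themselves are untouched, which is what a test=document_fix change should look like. A minimal sketch of how such a fingerprint can be recomputed locally, assuming it is the md5 hex digest of the API's docstring (the convention used by the repo's print_signatures tooling; treat the exact normalization as an assumption):

```python
# Hypothetical re-check of an API.spec 'document' hash: md5 of the docstring.
# The exact normalization applied by tools/print_signatures.py is assumed here.
import hashlib

import paddle.fluid as fluid


def doc_fingerprint(api_obj):
    """md5 hex digest of an API's docstring (empty string if undocumented)."""
    return hashlib.md5((api_obj.__doc__ or "").encode("utf-8")).hexdigest()


# If the md5 assumption holds, after this commit dynamic_lstm's docstring
# should hash to 'd4a82e2f5feb20c4a23ced8054e047ed'.
print(doc_fingerprint(fluid.layers.dynamic_lstm))
```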
@@ -653,7 +653,6 @@ def _pull_box_sparse(input, size, dtype='float32'):
     return outs


-@templatedoc(op_type="lstm")
 def dynamic_lstm(input,
                  size,
                  h_0=None,
@@ -668,58 +667,82 @@ def dynamic_lstm(input,
                  dtype='float32',
                  name=None):
     """
-    ${comment}
-
-    Args:
-        input (Variable): ${input_comment}
-        size (int): 4 * hidden size.
-        h_0(Variable): The initial hidden state is an optional input, default is zero.
-                       This is a tensor with shape (N x D), where N is the
-                       batch size and D is the hidden size.
-        c_0(Variable): The initial cell state is an optional input, default is zero.
-                       This is a tensor with shape (N x D), where N is the
-                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-                               hidden-hidden weights.
-
-                               - Weights = {:math:`W_{ch}, W_{ih}, \
-                                                W_{fh}, W_{oh}`}
-                               - The shape is (D x 4D), where D is the hidden
-                                 size.
-
-                               If it is set to None or one attribute of ParamAttr,
-                               dynamic_lstm will create ParamAttr as param_attr.
-                               If the Initializer of the param_attr is not set, the
-                               parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|None): The bias attribute for the learnable bias
+    **Note**:
+        1. This OP only supports LoDTensor as input. If you need to deal with Tensor, please use :ref:`api_fluid_layers_lstm` .
+        2. In order to improve efficiency, users must first map the input of dimension [T, hidden_size] to input of [T, 4 * hidden_size], and then pass it to this OP.
+
+    The implementation of this OP includes diagonal/peephole connections.
+    Please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
+    If you do not need peephole connections, please set use_peepholes to False.
+
+    This OP computes each timestep as follows:
+
+    .. math::
+        i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_{x_i} + b_{h_i})
+    .. math::
+        f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_{x_f} + b_{h_f})
+    .. math::
+        o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_{x_o} + b_{h_o})
+    .. math::
+        \widetilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_{x_c} + b_{h_c})
+    .. math::
+        c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
+    .. math::
+        h_t = o_t \odot tanh(c_t)
+
+    The symbolic meanings in the formula are as follows:
+
+    - :math:`x_{t}` represents the input at timestep :math:`t`
+    - :math:`h_{t}` represents the hidden state at timestep :math:`t`
+    - :math:`h_{t-1}, c_{t-1}` represent the hidden state and cell state at timestep :math:`t-1` , respectively
+    - :math:`\widetilde{c_t}` represents the candidate cell state
+    - :math:`i_t` , :math:`f_t` and :math:`o_t` represent the input gate, forget gate and output gate, respectively
+    - :math:`W` represents a weight (e.g., :math:`W_{ix}` is the weight of the linear transformation of the input :math:`x_{t}` when computing the input gate :math:`i_t` )
+    - :math:`b` represents a bias (e.g., :math:`b_{i}` is the bias of the input gate)
+    - :math:`\sigma` represents the nonlinear activation function for the gates, sigmoid by default
+    - :math:`\odot` represents the Hadamard product of two matrices, i.e. multiplying the elements at the same positions of two matrices of the same dimensions to produce another matrix of the same dimensions
+
+    Parameters:
+        input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, multi-dimensional LoDTensor of shape :math:`[T, 4*hidden\_size]` . Data type is float32 or float64.
+        size (int): must be 4 * hidden_size.
+        h_0( :ref:`api_guide_Variable_en` , optional): The initial hidden state of the LSTM, multi-dimensional Tensor of shape :math:`[batch\_size, hidden\_size]` .
+                       Data type is float32 or float64. If set to None, it will be a vector of all 0. Default: None.
+        c_0( :ref:`api_guide_Variable_en` , optional): The initial cell state of the LSTM, multi-dimensional Tensor of shape :math:`[batch\_size, hidden\_size]` .
+                       Data type is float32 or float64. If set to None, it will be a vector of all 0. `h_0` and `c_0` can be None, but only at the same time. Default: None.
+        param_attr(ParamAttr, optional): Parameter attribute of the weight. If it is None, the default weight parameter attribute is used. Please refer to :ref:`api_fluid_ParamAttr` .
+                               If the user needs to set this parameter, the dimension must be :math:`[hidden\_size, 4*hidden\_size]` . Default: None.
+
+                               - Weights = :math:`\{ W_{ch},W_{ih},W_{fh},W_{oh} \}` , the shape is [hidden_size, 4*hidden_size].
+        bias_attr (ParamAttr, optional): The bias attribute for the learnable bias
                               weights, which contains two parts, input-hidden
                               bias weights and peephole connections weights if
                               setting `use_peepholes` to `True`.
+                              Please refer to :ref:`api_fluid_ParamAttr` . Default: None.

                              1. `use_peepholes = False`
                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
-                                 - The shape is (1 x 4D).
+                                 - The shape is [1, 4*hidden_size].
                              2. `use_peepholes = True`
                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                 W_{fc}, W_{oc}`}.
-                                 - The shape is (1 x 7D).
-
-                              If it is set to None or one attribute of ParamAttr,
-                              dynamic_lstm will create ParamAttr as bias_attr.
-                              If the Initializer of the bias_attr is not set,
-                              the bias is initialized zero. Default: None.
-        use_peepholes (bool): ${use_peepholes_comment}
-        is_reverse (bool): ${is_reverse_comment}
-        gate_activation (str): ${gate_activation_comment}
-        cell_activation (str): ${cell_activation_comment}
-        candidate_activation (str): ${candidate_activation_comment}
-        dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
-        name (str|None): A name for this layer(optional). If set None, the layer
-                         will be named automatically.
+                                 - The shape is [1, 7*hidden_size].
+        use_peepholes (bool, optional): Whether to use peephole connections or not. Default: True.
+        is_reverse (bool, optional): Whether to compute the reversed LSTM. Default: False.
+        gate_activation (str, optional): The activation for the input gate, forget gate and output gate. Default: "sigmoid".
+        cell_activation (str, optional): The activation for the cell output. Default: "tanh".
+        candidate_activation (str, optional): The activation for the candidate hidden state. Default: "tanh".
+        dtype (str, optional): Data type, can be "float32" or "float64". Default: "float32".
+        name (str, optional): A name for this layer. Please refer to :ref:`api_guide_Name` . Default: None.

     Returns:
-        tuple: The hidden state, and cell state of LSTM. The shape of both \
-        is (T x D), and lod is the same with the `input`.
+        tuple ( :ref:`api_guide_Variable` , :ref:`api_guide_Variable` ) :
+
+            The hidden state and cell state of the LSTM
+
+                - hidden: LoDTensor with shape of :math:`[T, hidden\_size]` , with the same lod and dtype as the input.
+                - cell: LoDTensor with shape of :math:`[T, hidden\_size]` , with the same lod and dtype as the input.

     Examples:
         .. code-block:: python
@@ -729,15 +752,16 @@ def dynamic_lstm(input,
             vocab_size = 10000
             hidden_dim = 512
-            data = fluid.layers.data(name='x', shape=[1],
-                         dtype='int32', lod_level=1)
-            emb = fluid.layers.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
+            data = fluid.data(name='x', shape=[None], dtype='int64', lod_level=1)
+            emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
             forward_proj = fluid.layers.fc(input=emb, size=hidden_dim * 4,
                                            bias_attr=False)
-            forward, _ = fluid.layers.dynamic_lstm(
+            forward, cell = fluid.layers.dynamic_lstm(
                 input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
+            forward.shape  # (-1, 512)
+            cell.shape  # (-1, 512)
     """
     assert in_dygraph_mode(
     ) is not True, "please use lstm instead of dynamic_lstm in dygraph mode!"
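The six equations in the rewritten docstring map one-to-one onto a plain recurrence. A minimal NumPy sketch of a single non-peephole timestep; the parameter names and the gate ordering here are illustrative, not the operator's internal weight layout:

```python
# One LSTM timestep per the docstring equations (use_peepholes=False).
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def lstm_step(x_t, h_prev, c_prev, W_x, W_h, b):
    """x_t: [input_dim]; h_prev, c_prev: [D]; W_x: [input_dim, 4D]; W_h: [D, 4D]; b: [4D]."""
    gates = x_t @ W_x + h_prev @ W_h + b        # the [T, 4*hidden_size] pre-mapping, per timestep
    i, f, o, c_hat = np.split(gates, 4)         # illustrative gate order; the op's layout may differ
    i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)
    c_t = f * c_prev + i * np.tanh(c_hat)       # c_t = f ⊙ c_{t-1} + i ⊙ c̃_t
    h_t = o * np.tanh(c_t)                      # h_t = o ⊙ tanh(c_t)
    return h_t, c_t


D, input_dim = 4, 3
rng = np.random.RandomState(0)
h, c = np.zeros(D), np.zeros(D)
h, c = lstm_step(rng.randn(input_dim), h, c,
                 rng.randn(input_dim, 4 * D), rng.randn(D, 4 * D), np.zeros(4 * D))
assert h.shape == c.shape == (D,)
```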
@@ -799,77 +823,76 @@ def lstm(input,
          default_initializer=None,
          seed=-1):
     """
-    If Device is GPU, This op will use cudnn LSTM implementation
-
-    A four-gate Long Short-Term Memory network with no peephole connections.
-    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
-    the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
-
-    .. math::
-
-       i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
-
-       f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
-
-       o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
-
-       \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
-
-       c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-
-       h_t &= o_t \odot tanh(c_t)
-
-    - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
-      of weights from the input gate to the input)
-    - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
-    - sigmoid is the logistic sigmoid function.
-    - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
-      and cell activation vectors, respectively, all of which have the same size as
-      the cell output activation vector $h$.
-    - The :math:`\odot` is the element-wise product of the vectors.
-    - :math:`tanh` is the activation functions.
-    - :math:`\\tilde{c_t}` is also called candidate hidden state,
-      which is computed based on the current input and the previous hidden state.
-
-    Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication,
-    X represensts a matrix multiplication
-
-    Args:
-        input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
-        init_h(Variable): The initial hidden state of the LSTM
-                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
-                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        init_c(Variable): The initial cell state of the LSTM.
-                       This is a tensor with shape ( num_layers x batch_size x hidden_size )
-                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
-        hidden_size (int): hidden size of the LSTM
-        num_layers (int): total layers number of the LSTM
-        dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
-                             There is NO dropout work on rnn output of the last RNN layers
-        is_bidirec (bool): If it is bidirectional
-        is_test (bool): If it is in test phrase
-        name (str|None): A name for this layer(optional). If set None, the layer
-                         will be named automatically.
-        default_initializer(Initialize|None): Where use initializer to initialize the Weight
-                         If set None, defaule initializer will be used
-        seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed
-
-    Returns:
-        rnn_out(Tensor),last_h(Tensor),last_c(Tensor):
+    **Note**:
+        This OP only supports running on GPU devices.
+
+    This OP implements the LSTM operation described in `Hochreiter, S., & Schmidhuber, J. (1997) <http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf>`_ .
+    The implementation of this OP does not include diagonal/peephole connections.
+    Please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
+    If you need peephole connections, please use :ref:`api_fluid_layers_dynamic_lstm` .
+
+    This OP computes each timestep as follows:
+
+    .. math::
+        i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_{x_i} + b_{h_i})
+    .. math::
+        f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_{x_f} + b_{h_f})
+    .. math::
+        o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_{x_o} + b_{h_o})
+    .. math::
+        \widetilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_{x_c} + b_{h_c})
+    .. math::
+        c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
+    .. math::
+        h_t = o_t \odot tanh(c_t)
+
+    The symbolic meanings in the formula are as follows:
+
+    - :math:`x_{t}` represents the input at timestep :math:`t`
+    - :math:`h_{t}` represents the hidden state at timestep :math:`t`
+    - :math:`h_{t-1}, c_{t-1}` represent the hidden state and cell state at timestep :math:`t-1` , respectively
+    - :math:`\widetilde{c_t}` represents the candidate cell state
+    - :math:`i_t` , :math:`f_t` and :math:`o_t` represent the input gate, forget gate and output gate, respectively
+    - :math:`W` represents a weight (e.g., :math:`W_{ix}` is the weight of the linear transformation of the input :math:`x_{t}` when computing the input gate :math:`i_t` )
+    - :math:`b` represents a bias (e.g., :math:`b_{i}` is the bias of the input gate)
+    - :math:`\sigma` represents the nonlinear activation function for the gates, sigmoid by default
+    - :math:`\odot` represents the Hadamard product of two matrices, i.e. multiplying the elements at the same positions of two matrices of the same dimensions to produce another matrix of the same dimensions
+
+    Parameters:
+        input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, 3-D Tensor of shape :math:`[batch\_size, seq\_len, input\_dim]` . Data type is float32 or float64.
+        init_h( :ref:`api_guide_Variable_en` ): The initial hidden state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
+                       If is_bidirec = True, the shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
+        init_c( :ref:`api_guide_Variable_en` ): The initial cell state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
+                       If is_bidirec = True, the shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
+        max_len (int): max length of LSTM. The first dimension of the input tensor cannot be greater than max_len.
+        hidden_size (int): hidden size of the LSTM.
+        num_layers (int): total number of layers of the LSTM.
+        dropout_prob(float, optional): dropout probability. Dropout only works between RNN layers, not between time steps;
+                             no dropout is applied to the output of the last RNN layer.
+                             Default: 0.0.
+        is_bidirec (bool, optional): Whether it is bidirectional. Default: False.
+        is_test (bool, optional): Whether it is in the test phase. Default: False.
+        name (str, optional): A name for this layer. If set None, the layer
+                         will be named automatically. Default: None.
+        default_initializer(Initializer, optional): The initializer used to initialize the weight.
+                         If set None, the default initializer will be used. Default: None.
+        seed(int, optional): Seed for dropout in the LSTM. If it is -1, dropout will use a random seed. Default: -1.
+
+    Returns:
+        tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ) :
                         Three tensors, rnn_out, last_h, last_c:

-                        - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
-                          if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
+                        - rnn_out is the result of the LSTM hidden layer, shape is :math:`[seq\_len, batch\_size, hidden\_size]` ; \
+                          if is_bidirec is set to True, the shape will be :math:`[seq\_len, batch\_size, hidden\_size*2]`
                         - last_h is the hidden state of the last step of LSTM \
-                          shape is ( num_layers x batch_size x hidden_size ) \
-                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
+                          shape is :math:`[num\_layers, batch\_size, hidden\_size]` ; \
+                          if is_bidirec is set to True, the shape will be :math:`[num\_layers*2, batch\_size, hidden\_size]`
                         - last_c(Tensor): the cell state of the last step of LSTM \
-                          shape is ( num_layers x batch_size x hidden_size ) \
-                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
+                          shape is :math:`[num\_layers, batch\_size, hidden\_size]` ; \
+                          if is_bidirec is set to True, the shape will be :math:`[num\_layers*2, batch\_size, hidden\_size]`

     Examples:
@@ -880,9 +903,8 @@ def lstm(input,
             emb_dim = 256
             vocab_size = 10000
-            data = fluid.layers.data(name='x', shape=[-1, 100, 1],
-                          dtype='int32')
-            emb = fluid.layers.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
+            data = fluid.data(name='x', shape=[None, 100], dtype='int64')
+            emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
             batch_size = 20
             max_len = 100
             dropout_prob = 0.2
@@ -894,6 +916,9 @@ def lstm(input,
             rnn_out, last_h, last_c = layers.lstm( emb, init_h, init_c, \
                                     max_len, hidden_size, num_layers, \
                                     dropout_prob=dropout_prob)
+            rnn_out.shape  # (-1, 100, 150)
+            last_h.shape  # (1, 20, 150)
+            last_c.shape  # (1, 20, 150)
     """

     helper = LayerHelper('cudnn_lstm', **locals())
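The rewritten Returns section pins down how the output shapes scale with num_layers and is_bidirec. A small self-contained sketch of those shape rules, using the values from the docstring example (pure Python, following the [seq_len, batch_size, ...] layout the Returns section describes):

```python
# Shape arithmetic for fluid.layers.lstm per the Returns section above.
def lstm_state_shape(num_layers, batch_size, hidden_size, is_bidirec=False):
    """Shape of init_h/init_c and of last_h/last_c; doubled layers when bidirectional."""
    num_directions = 2 if is_bidirec else 1
    return (num_layers * num_directions, batch_size, hidden_size)


def lstm_out_shape(seq_len, batch_size, hidden_size, is_bidirec=False):
    """Shape of rnn_out; the hidden dimension doubles when bidirectional."""
    num_directions = 2 if is_bidirec else 1
    return (seq_len, batch_size, hidden_size * num_directions)


# Values from the docstring example: num_layers=1, batch_size=20, hidden_size=150.
assert lstm_state_shape(1, 20, 150) == (1, 20, 150)               # matches last_h.shape
assert lstm_out_shape(100, 20, 150, is_bidirec=True) == (100, 20, 300)
```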
@@ -978,138 +1003,102 @@ def dynamic_lstmp(input,
                   cell_clip=None,
                   proj_clip=None):
     """
-    **Dynamic LSTMP Layer**
-
-    LSTMP (LSTM with recurrent projection) layer has a separate projection
-    layer after the LSTM layer, projecting the original hidden state to a
-    lower-dimensional one, which is proposed to reduce the number of total
-    parameters and furthermore computational complexity for the LSTM,
-    espeacially for the case that the size of output units is relative
-    large (https://research.google.com/pubs/archive/43905.pdf).
-
-    The formula is as follows:
-
-    .. math::
-
-        i_t & = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)
-
-        f_t & = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)
-
-        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)
-
-        o_t & = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o)
-
-        c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-
-        h_t & = o_t \odot act_h(c_t)
-
-        r_t & = \overline{act_h}(W_{rh}h_t)
-
-    In the above formula:
-
-    * :math:`W`: Denotes weight matrices (e.g. :math:`W_{xi}` is \
-          the matrix of weights from the input gate to the input).
-    * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
-          matrices for peephole connections. In our implementation, \
-          we use vectors to represent these diagonal weight matrices.
-    * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
-          bias vector).
-    * :math:`\sigma`: The activation, such as logistic sigmoid function.
-    * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
-          gate, and cell activation vectors, respectively, all of which have \
-          the same size as the cell output activation vector :math:`h`.
-    * :math:`h`: The hidden state.
-    * :math:`r`: The recurrent projection of the hidden state.
-    * :math:`\\tilde{c_t}`: The candidate hidden state, whose \
-          computation is based on the current input and previous hidden state.
-    * :math:`\odot`: The element-wise product of the vectors.
-    * :math:`act_g` and :math:`act_h`: The cell input and cell output \
-          activation functions and `tanh` is usually used for them.
-    * :math:`\overline{act_h}`: The activation function for the projection \
-          output, usually using `identity` or same as :math:`act_h`.
-
-    Set `use_peepholes` to `False` to disable peephole connection. The formula
-    is omitted here, please refer to the paper
-    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
-
-    Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
-    operations on the input :math:`x_{t}` are NOT included in this operator.
-    Users can choose to use fully-connected layer before LSTMP layer.
-
-    Args:
-        input(Variable): The input of dynamic_lstmp layer, which supports
-                         variable-time length input sequence. The underlying
-                         tensor in this Variable is a matrix with shape
-                         (T X 4D), where T is the total time steps in this
-                         mini-batch, D is the hidden size.
-        size(int): 4 * hidden size.
+    **Note**:
+        1. In order to improve efficiency, users must first map the input of dimension [T, hidden_size] to input of [T, 4 * hidden_size], and then pass it to this OP.
+
+    This OP implements the LSTMP (LSTM Projected) layer described in `Sak, H., Senior, A., & Beaufays, F. (2014) <https://ai.google/research/pubs/pub43905.pdf>`_ .
+    Compared with the standard LSTM layer, LSTMP has an additional linear mapping layer behind the LSTM layer,
+    which is used to map from the original hidden state :math:`h_t` to the lower dimensional state :math:`r_t` .
+    This reduces the total number of parameters and computational complexity, especially when the output unit is relatively large.
+
+    The default implementation of this OP contains diagonal/peephole connections;
+    please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
+    If you need to disable the peephole connections, set use_peepholes to False.
+
+    This OP computes each timestep as follows:
+
+    .. math::
+        i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)
+    .. math::
+        f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)
+    .. math::
+        \widetilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)
+    .. math::
+        c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
+    .. math::
+        o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_{t} + b_o)
+    .. math::
+        h_t = o_t \odot act_h(c_t)
+    .. math::
+        r_t = \overline{act_h}(W_{rh}h_t)
+
+    The symbolic meanings in the formula are as follows:
+
+    - :math:`x_{t}` represents the input at timestep :math:`t`
+    - :math:`h_{t}` represents the hidden state at timestep :math:`t`
+    - :math:`r_{t}` represents the projected output of the hidden state :math:`h_{t}`
+    - :math:`h_{t-1}, c_{t-1}, r_{t-1}` represent the hidden state, cell state and projected output at timestep :math:`t-1` , respectively
+    - :math:`\widetilde{c_t}` represents the candidate cell state
+    - :math:`i_t` , :math:`f_t` and :math:`o_t` represent the input gate, forget gate and output gate, respectively
+    - :math:`W` represents a weight (e.g., :math:`W_{ix}` is the weight of the linear transformation of the input :math:`x_{t}` when computing the input gate :math:`i_t` )
+    - :math:`b` represents a bias (e.g., :math:`b_{i}` is the bias of the input gate)
+    - :math:`\sigma` represents the nonlinear activation function for the gates, sigmoid by default
+    - :math:`act_g` , :math:`act_h` and :math:`\overline{act_h}` represent the activations for the candidate state, the cell output and the projection output, tanh by default
+    - :math:`\odot` represents the Hadamard product of two matrices, i.e. multiplying the elements at the same positions of two matrices of the same dimensions to produce another matrix of the same dimensions
+
+    Parameters:
+        input( :ref:`api_guide_Variable_en` ): The input of the dynamic_lstmp layer, which supports
+                         variable-time length input sequences.
+                         It is a multi-dimensional LoDTensor of shape :math:`[T, 4*hidden\_size]` . Data type is float32 or float64.
+        size(int): must be 4 * hidden_size.
         proj_size(int): The size of projection output.
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
-                               hidden-hidden weight and projection weight.
-
-                               - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
-                                                W_{fh}, W_{oh}`}.
-                               - The shape of hidden-hidden weight is (P x 4D),
-                                 where P is the projection size and D the hidden
-                                 size.
-                               - Projection weight = {:math:`W_{rh}`}.
-                               - The shape of projection weight is (D x P).
-
-                               If it is set to None or one attribute of ParamAttr,
-                               dynamic_lstm will create ParamAttr as param_attr.
-                               If the Initializer of the param_attr is not set, the
-                               parameter is initialized with Xavier. Default: None.
-        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
+        param_attr(ParamAttr, optional): Parameter attribute of the learnable weights. If it is None, the default weight parameter attribute is used. Please refer to :ref:`api_fluid_ParamAttr` . Default: None.
+
+                               - Weights = :math:`\{ W_{cr},W_{ir},W_{fr},W_{or} \}` , the shape is [P, 4*hidden_size] , where P is the projection size.
+                               - Projection weight = :math:`\{ W_{rh} \}` , the shape is [hidden_size, P].
+        bias_attr (ParamAttr, optional): The bias attribute for the learnable bias
                               weights, which contains two parts, input-hidden
                               bias weights and peephole connections weights if
                               setting `use_peepholes` to `True`.
+                              Please refer to :ref:`api_fluid_ParamAttr` . Default: None.

                              1. `use_peepholes = False`
                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
-                                 - The shape is (1 x 4D).
+                                 - The shape is [1, 4*hidden_size].
                              2. `use_peepholes = True`
                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                 W_{fc}, W_{oc}`}.
-                                 - The shape is (1 x 7D).
-
-                              If it is set to None or one attribute of ParamAttr,
-                              dynamic_lstm will create ParamAttr as bias_attr.
-                              If the Initializer of the bias_attr is not set,
-                              the bias is initialized zero. Default: None.
-        use_peepholes(bool): Whether to enable diagonal/peephole connections,
-                             default `True`.
-        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
-        gate_activation(str): The activation for input gate, forget gate and
-                              output gate. Choices = ["sigmoid", "tanh", "relu",
-                              "identity"], default "sigmoid".
-        cell_activation(str): The activation for cell output. Choices = ["sigmoid",
-                              "tanh", "relu", "identity"], default "tanh".
-        candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
-                              default "tanh".
-        proj_activation(str): The activation for projection output.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
-                              default "tanh".
-        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-        h_0(Variable): The initial hidden state is an optional input, default is zero.
-                       This is a tensor with shape (N x D), where N is the
-                       batch size and D is the projection size.
-        c_0(Variable): The initial cell state is an optional input, default is zero.
-                       This is a tensor with shape (N x D), where N is the
-                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
-        cell_clip(float): If provided the cell state is clipped
-                          by this value prior to the cell output activation.
-        proj_clip(float): If `num_proj > 0` and `proj_clip` is
+                                 - The shape is [1, 7*hidden_size].
+        use_peepholes (bool, optional): Whether to use peephole connections or not. Default: True.
+        is_reverse (bool, optional): Whether to compute the reversed LSTM. Default: False.
+        gate_activation (str, optional): The activation for the input gate, forget gate and output gate. Default: "sigmoid".
+        cell_activation (str, optional): The activation for the cell output. Default: "tanh".
+        candidate_activation (str, optional): The activation for the candidate hidden state. Default: "tanh".
+        proj_activation(str, optional): The activation for the projection output. Default: "tanh".
+        dtype (str, optional): Data type, can be "float32" or "float64". Default: "float32".
+        name (str, optional): A name for this layer. Please refer to :ref:`api_guide_Name` . Default: None.
+        h_0( :ref:`api_guide_Variable` , optional): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape :math:`[batch\_size, P]` , where P is the projection size. Default: None.
+        c_0( :ref:`api_guide_Variable` , optional): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape :math:`[batch\_size, hidden\_size]` .
+                       `h_0` and `c_0` can be None, but only at the same time. Default: None.
+        cell_clip(float, optional): If not None, the cell state is clipped
+                          by this value prior to the cell output activation. Default: None.
+        proj_clip(float, optional): If `num_proj > 0` and `proj_clip` is
                           provided, then the projected values are clipped elementwise to within
-                          `[-proj_clip, proj_clip]`.
+                          `[-proj_clip, proj_clip]`. Default: None.

     Returns:
-        tuple: A tuple of two output variable: the projection of hidden state, \
-               and cell state of LSTMP. The shape of projection is (T x P), \
-               for the cell state which is (T x D), and both LoD is the same \
-               with the `input`.
+        tuple ( :ref:`api_guide_Variable` , :ref:`api_guide_Variable` ) :
+
+            The projected hidden state and cell state of the LSTMP
+
+                - hidden: LoDTensor with shape of :math:`[T, P]` , with the same lod and dtype as the input.
+                - cell: LoDTensor with shape of :math:`[T, hidden\_size]` , with the same lod and dtype as the input.

     Examples:
@@ -1117,19 +1106,20 @@ def dynamic_lstmp(input,
             import paddle.fluid as fluid
             dict_dim, emb_dim = 128, 64
-            data = fluid.layers.data(name='sequence', shape=[1],
-                         dtype='int32', lod_level=1)
-            emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+            data = fluid.data(name='sequence', shape=[None], dtype='int64', lod_level=1)
+            emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
             hidden_dim, proj_dim = 512, 256
             fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4,
                                      act=None, bias_attr=None)
-            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
+            proj_out, last_c = fluid.layers.dynamic_lstmp(input=fc_out,
                                              size=hidden_dim * 4,
                                              proj_size=proj_dim,
                                              use_peepholes=False,
                                              is_reverse=True,
                                              cell_activation="tanh",
                                              proj_activation="tanh")
+            proj_out.shape  # (-1, 256)
+            last_c.shape  # (-1, 512)
     """
     assert in_dygraph_mode(
 ...
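What LSTMP adds on top of the plain LSTM recurrence is the final projection r_t = act(W_rh h_t), which is also what shrinks proj_out to proj_size in the example above. A NumPy sketch of just that step, with illustrative names; tanh mirrors the default proj_activation:

```python
# The LSTMP projection step: map hidden state h_t down to r_t of size P.
import numpy as np

hidden_size, proj_size = 512, 256
rng = np.random.RandomState(0)
W_rh = rng.randn(hidden_size, proj_size)   # projection weight, shape [hidden_size, P]

h_t = rng.randn(hidden_size)               # hidden state from the LSTM equations
r_t = np.tanh(h_t @ W_rh)                  # r_t = act_h(W_rh h_t)
assert r_t.shape == (proj_size,)           # the recurrence continues from r_t, not h_t
```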