未验证 提交 0311fd15 编写于 作者: Y Yibing Liu 提交者: GitHub

Merge pull request #7888 from kuke/add_lstmp_doc

Add python api for lstmp operator
......@@ -18,6 +18,11 @@ dynamic_lstm
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
:noindex:
dynamic_lstmp
-------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
:noindex:
dynamic_gru
-----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
......
......@@ -147,6 +147,7 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
......
......@@ -26,6 +26,7 @@ __all__ = [
'fc',
'embedding',
'dynamic_lstm',
'dynamic_lstmp',
'dynamic_gru',
'gru_unit',
'linear_chain_crf',
......@@ -256,7 +257,8 @@ def dynamic_lstm(input,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
dtype='float32'):
dtype='float32',
name=None):
"""
**Dynamic LSTM Layer**
......@@ -282,7 +284,7 @@ def dynamic_lstm(input,
W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In
our implementation, we use vectors to reprenset these diagonal weight
matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input
gate bias vector), :math:`\sigma` is the non-line activations, such as
gate bias vector), :math:`\sigma` is the non-linear activations, such as
logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input
gate, forget gate, output gate, and cell activation vectors, respectively,
all of which have the same size as the cell output activation vector :math:`h`.
......@@ -308,25 +310,25 @@ def dynamic_lstm(input,
(T X 4D), where T is the total time steps in this
mini-batch, D is the hidden size.
size(int): 4 * hidden size.
param_attr(ParamAttr): The parameter attribute for the learnable
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weights.
- The shape is (D x 4D), where D is the hidden
size.
- Weights = {:math:`W_{ch}, W_{ih}, \
W_{fh}, W_{oh}`}
bias_attr(ParamAttr): The bias attribute for the learnable bias
- The shape is (D x 4D), where D is the hidden
size.
bias_attr(ParamAttr|None): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden
bias weights and peephole connections weights if
setting `use_peepholes` to `True`.
1. `use_peepholes = False`
- The shape is (1 x 4D).
- Biases = {:math:`b_c, b_i, b_f, b_o`}.
- The shape is (1 x 4D).
2. `use_peepholes = True`
- The shape is (1 x 7D).
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}.
- The shape is (1 x 7D).
use_peepholes(bool): Whether to enable diagonal/peephole connections,
default `True`.
is_reverse(bool): Whether to compute reversed LSTM, default `False`.
......@@ -339,6 +341,8 @@ def dynamic_lstm(input,
Choices = ["sigmoid", "tanh", "relu", "identity"],
default "tanh".
dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
tuple: The hidden state, and cell state of LSTM. The shape of both \
......@@ -353,6 +357,7 @@ def dynamic_lstm(input,
forward, _ = fluid.layers.dynamic_lstm(
input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
"""
helper = LayerHelper('lstm', **locals())
size = size / 4
weight = helper.create_parameter(
......@@ -389,6 +394,192 @@ def dynamic_lstm(input,
return hidden, cell
def dynamic_lstmp(input,
size,
proj_size,
param_attr=None,
bias_attr=None,
use_peepholes=True,
is_reverse=False,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
proj_activation='tanh',
dtype='float32',
name=None):
"""
**Dynamic LSTMP Layer**
LSTMP (LSTM with recurrent projection) layer has a separate projection
layer after the LSTM layer, projecting the original hidden state to a
lower-dimensional one, which is proposed to reduce the number of total
parameters and furthermore computational complexity for the LSTM,
espeacially for the case that the size of output units is relative
large (https://research.google.com/pubs/archive/43905.pdf).
The formula is as follows:
.. math::
i_t & = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)
f_t & = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)
\\tilde{c_t} & = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)
o_t & = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o)
c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
h_t & = o_t \odot act_h(c_t)
r_t & = \overline{act_h}(W_{rh}h_t)
In the above formula:
* :math:`W`: Denotes weight matrices (e.g. :math:`W_{xi}` is \
the matrix of weights from the input gate to the input).
* :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
matrices for peephole connections. In our implementation, \
we use vectors to reprenset these diagonal weight matrices.
* :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
bias vector).
* :math:`\sigma`: The activation, such as logistic sigmoid function.
* :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
gate, and cell activation vectors, respectively, all of which have \
the same size as the cell output activation vector :math:`h`.
* :math:`h`: The hidden state.
* :math:`r`: The recurrent projection of the hidden state.
* :math:`\\tilde{c_t}`: The candidate hidden state, whose \
computation is based on the current input and previous hidden state.
* :math:`\odot`: The element-wise product of the vectors.
* :math:`act_g` and :math:`act_h`: The cell input and cell output \
activation functions and `tanh` is usually used for them.
* :math:`\overline{act_h}`: The activation function for the projection \
output, usually using `identity` or same as :math:`act_h`.
Set `use_peepholes` to `False` to disable peephole connection. The formula
is omitted here, please refer to the paper
http://www.bioinf.jku.at/publications/older/2604.pdf for details.
Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
operations on the input :math:`x_{t}` are NOT included in this operator.
Users can choose to use fully-connected layer before LSTMP layer.
Args:
input(Variable): The input of dynamic_lstmp layer, which supports
variable-time length input sequence. The underlying
tensor in this Variable is a matrix with shape
(T X 4D), where T is the total time steps in this
mini-batch, D is the hidden size.
size(int): 4 * hidden size.
proj_size(int): The size of projection output.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weight and projection weight.
- Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
W_{fh}, W_{oh}`}.
- The shape of hidden-hidden weight is (P x 4D),
where P is the projection size and D the hidden
size.
- Projection weight = {:math:`W_{rh}`}.
- The shape of projection weight is (D x P).
bias_attr(ParamAttr|None): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden
bias weights and peephole connections weights if
setting `use_peepholes` to `True`.
1. `use_peepholes = False`
- Biases = {:math:`b_c, b_i, b_f, b_o`}.
- The shape is (1 x 4D).
2. `use_peepholes = True`
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}.
- The shape is (1 x 7D).
use_peepholes(bool): Whether to enable diagonal/peephole connections,
default `True`.
is_reverse(bool): Whether to compute reversed LSTM, default `False`.
gate_activation(str): The activation for input gate, forget gate and
output gate. Choices = ["sigmoid", "tanh", "relu",
"identity"], default "sigmoid".
cell_activation(str): The activation for cell output. Choices = ["sigmoid",
"tanh", "relu", "identity"], default "tanh".
candidate_activation(str): The activation for candidate hidden state.
Choices = ["sigmoid", "tanh", "relu", "identity"],
default "tanh".
proj_activation(str): The activation for projection output.
Choices = ["sigmoid", "tanh", "relu", "identity"],
default "tanh".
dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
tuple: The projection of hidden state, and cell state of LSTMP. The \
shape of projection is (T x P), for the cell state which is \
(T x D), and both LoD is the same with the `input`.
Examples:
.. code-block:: python
hidden_dim, proj_dim = 512, 256
fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
act=None, bias_attr=None)
proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
size=hidden_dim * 4,
proj_size=proj_dim,
use_peepholes=False,
is_reverse=True,
cell_activation="tanh",
proj_activation="tanh")
"""
helper = LayerHelper('lstmp', **locals())
size = size / 4
weight = helper.create_parameter(
attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
proj_weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, proj_size], dtype=dtype)
bias_size = [1, 7 * size]
if not use_peepholes:
bias_size[1] = 4 * size
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
projection = helper.create_tmp_variable(dtype)
cell = helper.create_tmp_variable(dtype)
ordered_proj0 = helper.create_tmp_variable(dtype)
batch_hidden = helper.create_tmp_variable(dtype)
batch_gate = helper.create_tmp_variable(dtype)
batch_cell_pre_act = helper.create_tmp_variable(dtype)
helper.append_op(
type='lstmp',
inputs={
'Input': input,
'Weight': weight,
'ProjWeight': proj_weight,
'Bias': bias
},
outputs={
'Projection': projection,
'Cell': cell,
'OrderedP0': ordered_proj0,
'BatchHidden': batch_hidden,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
'candidate_activation': candidate_activation,
'proj_activation': proj_activation
})
return projection, cell
def dynamic_gru(input,
size,
param_attr=None,
......
......@@ -202,6 +202,18 @@ class TestBook(unittest.TestCase):
x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
print(str(program))
def test_dynamic_lstmp(self):
program = Program()
with program_guard(program):
hidden_dim, proj_dim = 16, 8
seq_data = layers.data(
name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
fc_out = layers.fc(input=seq_data, size=4 * hidden_dim)
self.assertIsNotNone(
layers.dynamic_lstmp(
input=fc_out, size=4 * hidden_dim, proj_size=proj_dim))
print(str(program))
def test_sequence_softmax(self):
program = Program()
with program_guard(program):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册