diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index f738bf15641d9fca0bfb0c10821de778ceee0d79..231ec2d4ba102a5d31c47cbc7a5d484ef17a7f3a 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -18,6 +18,11 @@ dynamic_lstm
 .. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
     :noindex:
 
+dynamic_lstmp
+-------------
+.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+    :noindex:
+
 dynamic_gru
 -----------
 .. autofunction:: paddle.v2.fluid.layers.dynamic_gru
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 15f7cb6b560590f55e276fde4900d2e3c0045fb8..48cf5816cce4bb5ee8e66e72c5b1acea8535ab10 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -147,6 +147,7 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(lstmp_op DEPS sequence2batch lstm_compute)
 op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index bae33f6a156b094d6e9cccc9cabb42880f157ac8..d11dccfd22124d58d8634c01a00527c373b92f00 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -26,6 +26,7 @@ __all__ = [
     'fc',
     'embedding',
     'dynamic_lstm',
+    'dynamic_lstmp',
     'dynamic_gru',
     'gru_unit',
     'linear_chain_crf',
@@ -256,7 +257,8 @@ def dynamic_lstm(input,
                  gate_activation='sigmoid',
                  cell_activation='tanh',
                  candidate_activation='tanh',
-                 dtype='float32'):
+                 dtype='float32',
+                 name=None):
     """
     **Dynamic LSTM Layer**
 
@@ -282,7 +284,7 @@
     W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In
     our implementation, we use vectors to reprenset these diagonal weight
     matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input
-    gate bias vector), :math:`\sigma` is the non-line activations, such as
+    gate bias vector), :math:`\sigma` is the non-linear activation, such as
     logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input
     gate, forget gate, output gate, and cell activation vectors, respectively,
     all of which have the same size as the cell output activation vector :math:`h`.
@@ -308,25 +310,25 @@
                     (T X 4D), where T is the total time steps in this
                     mini-batch, D is the hidden size.
         size(int): 4 * hidden size.
-        param_attr(ParamAttr): The parameter attribute for the learnable
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
                                hidden-hidden weights.
-                               - The shape is (D x 4D), where D is the hidden
-                                 size.
                                - Weights = {:math:`W_{ch}, W_{ih}, \
                                                    W_{fh}, W_{oh}`}
-        bias_attr(ParamAttr): The bias attribute for the learnable bias
+                               - The shape is (D x 4D), where D is the hidden
+                                 size.
+        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
                               weights, which contains two parts, input-hidden
                               bias weights and peephole connections weights if
                               setting `use_peepholes` to `True`.
 
                               1. `use_peepholes = False`
-                                - The shape is (1 x 4D).
                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                                - The shape is (1 x 4D).
                               2. `use_peepholes = True`
-                                - The shape is (1 x 7D).
                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                    W_{fc}, W_{oc}`}.
+                                - The shape is (1 x 7D).
         use_peepholes(bool): Whether to enable diagonal/peephole connections,
                              default `True`.
         is_reverse(bool): Whether to compute reversed LSTM, default `False`.
@@ -339,6 +341,8 @@
                               Choices = ["sigmoid", "tanh", "relu", "identity"],
                               default "tanh".
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
+        name(str|None): A name for this layer (optional). If set to None, the
+                        layer will be named automatically.
 
     Returns:
         tuple: The hidden state, and cell state of LSTM. The shape of both \
@@ -353,6 +357,7 @@
             forward, _ = fluid.layers.dynamic_lstm(
                 input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
     """
+
     helper = LayerHelper('lstm', **locals())
     size = size / 4
     weight = helper.create_parameter(
@@ -389,6 +394,192 @@ def dynamic_lstm(input,
 
     return hidden, cell
 
+
+def dynamic_lstmp(input,
+                  size,
+                  proj_size,
+                  param_attr=None,
+                  bias_attr=None,
+                  use_peepholes=True,
+                  is_reverse=False,
+                  gate_activation='sigmoid',
+                  cell_activation='tanh',
+                  candidate_activation='tanh',
+                  proj_activation='tanh',
+                  dtype='float32',
+                  name=None):
+    """
+    **Dynamic LSTMP Layer**
+
+    LSTMP (LSTM with recurrent projection) layer has a separate projection
+    layer after the LSTM layer, projecting the original hidden state to a
+    lower-dimensional one, which is proposed to reduce the total number of
+    parameters and, furthermore, the computational complexity of the LSTM,
+    especially for the case where the size of the output units is relatively
+    large (https://research.google.com/pubs/archive/43905.pdf).
+
+    The formula is as follows:
+
+    .. math::
+
+        i_t & = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)
+
+        f_t & = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)
+
+        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)
+
+        o_t & = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o)
+
+        c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
+
+        h_t & = o_t \odot act_h(c_t)
+
+        r_t & = \overline{act_h}(W_{rh}h_t)
+
+    In the above formula:
+
+    * :math:`W`: Denotes weight matrices (e.g. :math:`W_{ix}` is \
+          the matrix of weights from the input to the input gate).
+    * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
+          matrices for peephole connections. In our implementation, \
+          we use vectors to represent these diagonal weight matrices.
+    * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
+          bias vector).
+    * :math:`\sigma`: The activation function, such as the logistic sigmoid.
+    * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
+          gate, and cell activation vectors, respectively, all of which have \
+          the same size as the cell output activation vector :math:`h`.
+    * :math:`h`: The hidden state.
+    * :math:`r`: The recurrent projection of the hidden state.
+    * :math:`\\tilde{c_t}`: The candidate hidden state, whose \
+          computation is based on the current input and previous hidden state.
+    * :math:`\odot`: The element-wise product of the vectors.
+    * :math:`act_g` and :math:`act_h`: The cell input and cell output \
+          activation functions; `tanh` is usually used for them.
+    * :math:`\overline{act_h}`: The activation function for the projection \
+          output, usually `identity` or the same as :math:`act_h`.
+
+    Set `use_peepholes` to `False` to disable the peephole connections. The
+    formula is omitted here; please refer to the paper
+    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
+
+    Note that these :math:`W_{ix}x_{t}, W_{fx}x_{t}, W_{cx}x_{t}, W_{ox}x_{t}`
+    operations on the input :math:`x_{t}` are NOT included in this operator.
+    Users can choose to use a fully-connected layer before the LSTMP layer.
+
+    Args:
+        input(Variable): The input of dynamic_lstmp layer, which supports
+                         variable-time length input sequence. The underlying
+                         tensor in this Variable is a matrix with shape
+                         (T X 4D), where T is the total time steps in this
+                         mini-batch, D is the hidden size.
+        size(int): 4 * hidden size.
+        proj_size(int): The size of projection output.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+                           hidden-hidden weight and projection weight.
+
+                           - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
+                                                     W_{fh}, W_{oh}`}.
+                           - The shape of hidden-hidden weight is (P x 4D),
+                             where P is the projection size and D the hidden
+                             size.
+                           - Projection weight = {:math:`W_{rh}`}.
+                           - The shape of projection weight is (D x P).
+        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
+                          weights, which contains two parts, input-hidden
+                          bias weights and peephole connections weights if
+                          setting `use_peepholes` to `True`.
+
+                          1. `use_peepholes = False`
+                             - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                             - The shape is (1 x 4D).
+                          2. `use_peepholes = True`
+                             - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                          W_{fc}, W_{oc}`}.
+                             - The shape is (1 x 7D).
+        use_peepholes(bool): Whether to enable diagonal/peephole connections,
+                             default `True`.
+        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
+        gate_activation(str): The activation for input gate, forget gate and
+                              output gate. Choices = ["sigmoid", "tanh", "relu",
+                              "identity"], default "sigmoid".
+        cell_activation(str): The activation for cell output. Choices = ["sigmoid",
+                              "tanh", "relu", "identity"], default "tanh".
+        candidate_activation(str): The activation for candidate hidden state.
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              default "tanh".
+        proj_activation(str): The activation for projection output.
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              default "tanh".
+        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
+        name(str|None): A name for this layer (optional). If set to None, the
+                        layer will be named automatically.
+
+    Returns:
+        tuple: The projection of hidden state, and cell state of LSTMP. The \
+               shape of the projection is (T x P) and that of the cell state \
+               is (T x D); both have the same LoD as the `input`.
+
+    Examples:
+        .. code-block:: python
+
+            hidden_dim, proj_dim = 512, 256
+            fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+                                     act=None, bias_attr=None)
+            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
+                                                     size=hidden_dim * 4,
+                                                     proj_size=proj_dim,
+                                                     use_peepholes=False,
+                                                     is_reverse=True,
+                                                     cell_activation="tanh",
+                                                     proj_activation="tanh")
+    """
+
+    helper = LayerHelper('lstmp', **locals())
+    size = size / 4
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
+    proj_weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, proj_size], dtype=dtype)
+    bias_size = [1, 7 * size]
+    if not use_peepholes:
+        bias_size[1] = 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    projection = helper.create_tmp_variable(dtype)
+    cell = helper.create_tmp_variable(dtype)
+    ordered_proj0 = helper.create_tmp_variable(dtype)
+    batch_hidden = helper.create_tmp_variable(dtype)
+    batch_gate = helper.create_tmp_variable(dtype)
+    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstmp',
+        inputs={
+            'Input': input,
+            'Weight': weight,
+            'ProjWeight': proj_weight,
+            'Bias': bias
+        },
+        outputs={
+            'Projection': projection,
+            'Cell': cell,
+            'OrderedP0': ordered_proj0,
+            'BatchHidden': batch_hidden,
+            'BatchGate': batch_gate,
+            'BatchCellPreAct': batch_cell_pre_act
+        },
+        attrs={
+            'use_peepholes': use_peepholes,
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'cell_activation': cell_activation,
+            'candidate_activation': candidate_activation,
+            'proj_activation': proj_activation
+        })
+    return projection, cell
+
+
 def dynamic_gru(input,
                 size,
                 param_attr=None,
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 4e863625422c93c77ad4fb65be35580943d1cf54..3f54e28defb76d3430a82e791578e20b84833f16 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -202,6 +202,18 @@ class TestBook(unittest.TestCase):
                 x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
         print(str(program))
 
+    def test_dynamic_lstmp(self):
+        program = Program()
+        with program_guard(program):
+            hidden_dim, proj_dim = 16, 8
+            seq_data = layers.data(
+                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
+            fc_out = layers.fc(input=seq_data, size=4 * hidden_dim)
+            self.assertIsNotNone(
+                layers.dynamic_lstmp(
+                    input=fc_out, size=4 * hidden_dim, proj_size=proj_dim))
+        print(str(program))
+
     def test_sequence_softmax(self):
         program = Program()
         with program_guard(program):
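
As a quick way to try the new layer end to end, the following is a minimal sketch pieced together from the docstring example and the new unit test above. The import path follows the `paddle.v2.fluid` package this patch touches; the variable names, shapes and dimensions are illustrative only and not part of the patch.

.. code-block:: python

    import paddle.v2.fluid as fluid

    hidden_dim, proj_dim = 512, 256

    # A variable-length sequence input; lod_level=1 marks it as a LoD sequence.
    input_seq = fluid.layers.data(
        name='input_seq', shape=[128], dtype='float32', lod_level=1)

    # The W_{ix}x_t, W_{fx}x_t, W_{cx}x_t, W_{ox}x_t projections of the input
    # are NOT part of the lstmp op, so feed the input through a fully-connected
    # layer of size 4 * hidden_dim first (mirroring the docstring example).
    fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
                             act=None, bias_attr=None)

    # proj_out holds the projected hidden state with shape (T x proj_dim);
    # cell holds the cell state with shape (T x hidden_dim).
    proj_out, cell = fluid.layers.dynamic_lstmp(input=fc_out,
                                                size=hidden_dim * 4,
                                                proj_size=proj_dim,
                                                use_peepholes=False)

Choosing `proj_size` smaller than the hidden size is where LSTMP's saving comes from: per the `param_attr` notes above, the recurrent weight shrinks from (D x 4D) to (P x 4D) plus a (D x P) projection matrix.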