def dynamic_gru(input,
                size,
                param_attr=None,
                bias_attr=None,
                is_reverse=False,
                gate_activation='sigmoid',
                candidate_activation='tanh',
                h_0=None):
    r"""
    **Dynamic GRU Layer**

    Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_

    The formula is as follows:

    .. math::

        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)

        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)

        \tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)

        h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \tilde{h_t}

    The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
    is the update gate and reset gate activation function and :math:`sigmoid`
    is usually used for it. :math:`act_c` is the activation function for
    candidate hidden state and :math:`tanh` is usually used for it.

    Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations on
    the input :math:`x_{t}` are NOT included in this operator. Users can choose
    to use a fully-connected layer before the GRU layer.

    Args:
        input(Variable): The input of dynamic_gru layer, which supports
            variable-time length input sequence. The underlying tensor in this
            Variable is a matrix with shape :math:`(T \times 3D)`, where
            :math:`T` is the total time steps in this mini-batch, :math:`D`
            is the hidden size.
        size(int): The dimension of the gru cell.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
            hidden-hidden weight matrix. Note:

            - The shape of the weight matrix is :math:`(D \times 3D)`, where
              :math:`D` is the hidden size.
            - All elements in the weight matrix can be divided into two parts.
              The first part are weights of the update gate and reset gate with
              shape :math:`(D \times 2D)`, and the second part are weights for
              candidate hidden state with shape :math:`(D \times D)`.
        bias_attr(ParamAttr|None): The parameter attribute for the learnable
            hidden-hidden bias, created with shape :math:`(1 \times 3D)`.
        is_reverse(bool): Whether to compute reversed GRU, default
            :attr:`False`.
        gate_activation(str): The activation for update gate and reset gate.
            Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
        candidate_activation(str): The activation for candidate hidden state.
            Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
        h_0(Variable|None): The optional initial hidden state. When it is
            :attr:`None` (the default) no initial state is handed to the
            operator. When given, this layer asserts that its shape equals
            ``(size, size)``.

    Returns:
        Variable: The hidden state of GRU. The shape is :math:`(T \times D)`,
        and lod is the same with the input.

    Examples:
        .. code-block:: python

            hidden_dim = 512
            x = fluid.layers.fc(input=data, size=hidden_dim * 3)
            hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
    """

    # NOTE: locals() must be captured before any other local is bound, so that
    # only the call arguments are forwarded to the helper.
    helper = LayerHelper('gru', **locals())
    dtype = helper.input_dtype()

    # Hidden-hidden weights for update gate, reset gate and candidate state,
    # packed side by side along the second dimension.
    weight = helper.create_parameter(
        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
    # Identity test: `!= None` would dispatch to whatever comparison operator
    # the Variable type defines instead of checking for a missing argument.
    if h_0 is not None:
        # NOTE(review): the gru operator appears to document its initial state
        # as (batch_size x D); this (size, size) check looks stricter than
        # that -- confirm against the operator before relaxing it.
        assert h_0.shape == (
            size, size), 'The shape of h0 should be(%d, %d)' % (size, size)
        # The operator's optional initial-state slot is spelled 'H0', matching
        # the capitalised 'Input'/'Weight'/'Bias' slots above.
        inputs['H0'] = h_0

    hidden = helper.create_tmp_variable(dtype)
    # Intermediate buffers the operator writes alongside the hidden state;
    # only `hidden` is returned to the caller.
    batch_gate = helper.create_tmp_variable(dtype)
    batch_reset_hidden_prev = helper.create_tmp_variable(dtype)
    batch_hidden = helper.create_tmp_variable(dtype)

    helper.append_op(
        type='gru',
        inputs=inputs,
        outputs={
            'Hidden': hidden,
            'BatchGate': batch_gate,
            'BatchResetHiddenPrev': batch_reset_hidden_prev,
            'BatchHidden': batch_hidden
        },
        attrs={
            'is_reverse': is_reverse,
            'gate_activation': gate_activation,
            'activation': candidate_activation
        })
    return hidden