diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 6c5961cc63d1c140e0a6f33aac054acdbbe8e8e0..bad2c325be3524443c21981c59a6a224efcc01b5 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -22,7 +22,7 @@ from . import layers
 from ..framework import Variable, OpProtoHolder
 from ..param_attr import ParamAttr
 from ..initializer import Normal, Constant
-__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
+__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit']
 
 
 class Conv2D(layers.Layer):
@@ -496,3 +496,150 @@ class Embedding(layers.Layer):
         })
 
         return out
+
+
+class GRUUnit(layers.Layer):
+    """
+    **GRU unit layer**
+
+    If origin_mode is True, the equation of a GRU step comes from the paper
+    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
+
+    .. math::
+        u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+        r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+        m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+        h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
+
+    If origin_mode is False, the equation of a GRU step comes from the paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/abs/1412.3555>`_
+
+    .. math::
+        u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+        r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+        m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+        h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
+
+    The inputs of the GRU unit are :math:`z_t` and :math:`h_{t-1}`. In terms
+    of the equations above, :math:`z_t` is split into 3 parts:
+    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that, in order to
+    implement a full GRU unit for an input, a fully connected layer has to be
+    applied first, such that :math:`z_t = W_{fc}x_t`.
+
+    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
+    of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is an
+    intermediate candidate hidden output, which is denoted by :math:`m_t`.
+    This layer has three outputs: :math:`h_t`, :math:`dot(r_t, h_{t-1})` and
+    the concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
+
+    Args:
+        size (integer): The input dimension value.
+        param_attr (ParamAttr|None): The parameter attribute for the learnable
+            hidden-hidden weight matrix. Note:
+
+            - The shape of the weight matrix is :math:`(D \\times 3D)`, where
+              :math:`D` is the hidden size.
+            - All elements in the weight matrix can be divided into two parts.
+              The first part contains the weights of the update gate and reset
+              gate with shape :math:`(D \\times 2D)`, and the second part
+              contains the weights of the candidate hidden state with shape
+              :math:`(D \\times D)`.
+
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create a ParamAttr as param_attr. If the Initializer of the
+            param_attr is not set, the parameter is initialized with Xavier.
+            Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of the GRU. Note that the bias with shape :math:`(1 \\times 3D)`
+            concatenates the biases of the update gate, reset gate and
+            candidate calculations. If it is set to False, no bias will be
+            applied to the update gate, reset gate and candidate calculations.
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create a ParamAttr as bias_attr. If the Initializer of the
+            bias_attr is not set, the bias is initialized to zero.
+            Default: None.
+        activation (string): The activation type for the cell (actNode).
+            Default: 'tanh'
+        gate_activation (string): The activation type for the gates (actGate).
+            Default: 'sigmoid'
+
+    The ``forward`` method takes two inputs:
+        input (Variable): The fc transformed input value of the current step.
+        hidden (Variable): The hidden value of the GRU unit from the previous
+            step.
+
+    Returns:
+        tuple: The hidden value, reset-hidden value and gate values.
+    """
+
+    def __init__(self,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 activation='tanh',
+                 gate_activation='sigmoid',
+                 origin_mode=False,
+                 dtype='float32'):
+        super(GRUUnit, self).__init__()
+
+        activation_dict = dict(
+            identity=0,
+            sigmoid=1,
+            tanh=2,
+            relu=3, )
+        self._activation = activation_dict[activation]
+        self._gate_activation = activation_dict[gate_activation]
+        self._origin_mode = origin_mode
+        self._dtype = dtype
+
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(
+            'gru_unit', param_attr=param_attr, bias_attr=bias_attr, dtype=dtype)
+
+        size = size // 3
+        # create the learnable hidden-hidden weight with shape [D, 3 * D]
+        self._weight = self._helper.create_parameter(
+            attr=self._helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+        # create the bias with shape [1, 3 * D] unless bias_attr is False
+        self._bias = None
+        if self._helper.bias_attr:
+            bias_size = [1, 3 * size]
+            self._bias = self._helper.create_parameter(
+                attr=self._helper.bias_attr,
+                shape=bias_size,
+                dtype=dtype,
+                is_bias=True)
+
+    def forward(self, input, hidden):
+        inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': self._weight}
+        if self._bias is not None:
+            inputs['Bias'] = self._bias
+
+        gate = self._helper.create_variable_for_type_inference(self._dtype)
+        reset_hidden_pre = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        updated_hidden = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        self._helper.append_op(
+            type='gru_unit',
+            inputs=inputs,
+            outputs={
+                'Gate': gate,
+                'ResetHiddenPrev': reset_hidden_pre,
+                'Hidden': updated_hidden,
+            },
+            attrs={
+                'activation': self._activation,
+                'gate_activation': self._gate_activation,
+                'origin_mode': self._origin_mode,
+            })
+
+        return updated_hidden, reset_hidden_pre, gate
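A minimal usage sketch for the GRUUnit layer added above, not part of the diff. It assumes the imperative-mode entry points fluid.imperative.guard() and fluid.imperative.base.to_variable() from the same package, and that the layers.Layer base class dispatches __call__ to forward; batch size, hidden size and variable names are illustrative only.

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative.nn import GRUUnit

batch_size, hidden_dim = 4, 16

with fluid.imperative.guard():
    # `size` is 3 * D, covering the concatenated update/reset/candidate projections.
    gru = GRUUnit(size=3 * hidden_dim)

    # `input` is the fc-transformed step input with shape [N, 3D]; `hidden` is the
    # previous hidden state with shape [N, D]. Both are random placeholders here.
    step_input = fluid.imperative.base.to_variable(
        np.random.rand(batch_size, 3 * hidden_dim).astype('float32'))
    prev_hidden = fluid.imperative.base.to_variable(
        np.random.rand(batch_size, hidden_dim).astype('float32'))

    new_hidden, reset_hidden_pre, gate = gru(step_input, prev_hidden)
    # new_hidden has shape [N, D] and can be fed back as prev_hidden for the next step.

The layer returns the same triple as the static-graph gru_unit layer (hidden value, reset-hidden value and gate values), so a recurrent loop can be written by repeatedly calling the layer with each step's fc-transformed input and the previously returned hidden state.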