diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 07e11d5c15c565bf3af5146dad0c5effbb034e08..8b8588f139fed72f90f4812eb2e119ff638a8415 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -68,7 +68,7 @@ paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
 paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
-paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None))
+paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
 paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
 paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc
index 6d91d0d5c0dad14c84dcddbfdfcf4edb82db7687..e3beedcf10b6286c92371c48cae7912aef35e7a3 100644
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
@@ -113,7 +113,10 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
       .InEnum({identity, sigmoid, tanh, relu});
   AddAttr<bool>("origin_mode",
                 "bool"
-                "use origin mode in article https://arxiv.org/abs/1412.3555")
+                "use origin mode in article <Learning Phrase Representations "
+                "using RNN Encoder-Decoder\n"
+                "for Statistical Machine "
+                "Translation>(https://arxiv.org/pdf/1406.1078.pdf)")
       .SetDefault(false);
   AddComment(R"DOC(
 GRUUnit Operator implements partial calculations of the GRU unit as following:
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index a3e29a182b0d05b2ce5af992e3427b98c1c90e44..3352ff583154668dfb0e8cb7f2726868a6798f83 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -864,12 +864,14 @@ def dynamic_gru(input,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
-                h_0=None):
+                h_0=None,
+                origin_mode=False):
     """
     **Gated Recurrent Unit (GRU) Layer**
 
-    Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
-    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_ .
+    If origin_mode is False, then the equation of a GRU step is from the paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/abs/1412.3555>`_ .
 
     The formula is as follows:
 
@@ -883,6 +885,20 @@ def dynamic_gru(input,
 
         h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
 
+    If origin_mode is True, then the equation is from the paper
+    `Learning Phrase Representations using RNN Encoder–Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+
     The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
     is the update gate and reset gate activation function and :math:`sigmoid`
     is usually used for it. :math:`act_c` is the activation function for
@@ -980,7 +996,8 @@
         attrs={
             'is_reverse': is_reverse,
             'gate_activation': gate_activation,
-            'activation': candidate_activation
+            'activation': candidate_activation,
+            'origin_mode': origin_mode
         })
     return hidden
 
@@ -994,7 +1011,11 @@ def gru_unit(input,
              gate_activation='sigmoid',
              origin_mode=False):
     """
-    GRU unit layer. The equation of a gru step is:
+    **GRU unit layer**
+
+    If origin_mode is True, then the equation of a GRU step is from the paper
+    `Learning Phrase Representations using RNN Encoder–Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
 
         .. math::
             u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
@@ -1003,7 +1024,21 @@ def gru_unit(input,
 
             m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
 
-            h_t & = dot((1-u_t), m_t) + dot(u_t, h_{t-1})
+            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
+
+    If origin_mode is False, then the equation of a GRU step is from the paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/abs/1412.3555>`_
+
+        .. math::
+            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+            h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
+
 
     The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms
     of the equation above, the :math:`z_t` is split into 3 parts -
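
Not part of the patch, but a quick usage sketch of the new flag on `dynamic_gru`. The sizes and tensor names below are invented for illustration; the only real constraint carried over from the layer is that its input must already be projected to `3 * hidden_dim`:

```python
import paddle.fluid as fluid

dict_dim, emb_dim, hidden_dim = 10000, 256, 512  # illustrative sizes

# A variable-length id sequence (LoD level 1).
data = fluid.layers.data(
    name='sequence', shape=[1], dtype='int64', lod_level=1)
emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])

# dynamic_gru expects the update gate, reset gate, and candidate
# projections packed together, so the input width is 3 * hidden_dim.
x = fluid.layers.fc(input=emb, size=hidden_dim * 3)

# origin_mode=True  -> h_t = u_t * h_{t-1} + (1 - u_t) * ~h_t   (1406.1078)
# origin_mode=False -> h_t = (1 - u_t) * h_{t-1} + u_t * ~h_t   (1412.3555, default)
hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim, origin_mode=True)
```

Since the default stays `origin_mode=False`, existing models keep their current update rule and the change is backward compatible.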
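`gru_unit` already exposed `origin_mode` (see the API.spec context line); this patch only rewrites its docstring. For completeness, a minimal single-step sketch under the same caveats (hypothetical names and shapes; `size` is the projected step-input width, i.e. `3 * hidden_dim`):

```python
import paddle.fluid as fluid

hidden_dim = 10

# Hypothetical step input and previous hidden state for one time step.
x_t_data = fluid.layers.data(name='x_t', shape=[20], dtype='float32')
prev_hidden = fluid.layers.data(
    name='h_prev', shape=[hidden_dim], dtype='float32')

# As with dynamic_gru, the step input is projected to 3 * hidden_dim first.
x_t = fluid.layers.fc(input=x_t_data, size=hidden_dim * 3)

# gru_unit returns the updated hidden state, the reset hidden state, and
# the concatenated gate values; origin_mode selects between the same two
# update rules documented for dynamic_gru.
hidden_val, r_h_val, gate_val = fluid.layers.gru_unit(
    input=x_t, hidden=prev_hidden, size=hidden_dim * 3, origin_mode=True)
```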