diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst index 6e813ab1a820d068ea3e54cad6178f1cf928eadc..048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1 100644 --- a/doc/api/v2/config/networks.rst +++ b/doc/api/v2/config/networks.rst @@ -125,3 +125,8 @@ simple_attention :members: simple_attention :noindex: +dot_product_attention +--------------------- +.. automodule:: paddle.v2.networks + :members: dot_product_attention + :noindex: diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 93e8ac173e721d9623fce91f30ac4642d273caba..0ecbacb7bbc302449faed86665d6eb38e65f48e7 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -26,8 +26,9 @@ __all__ = [ 'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool", "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg', 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru', - 'simple_attention', 'simple_gru2', 'bidirectional_gru', 'text_conv_pool', - 'bidirectional_lstm', 'inputs', 'outputs' + 'simple_attention', 'dot_product_attention', 'simple_gru2', + 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs', + 'outputs' ] ###################################################### @@ -1361,6 +1362,7 @@ def simple_attention(encoded_sequence, compute attention weight. :type transform_param_attr: ParameterAttribute :return: a context vector + :rtype: LayerOutput """ assert encoded_proj.size == decoder_state.size proj_size = encoded_proj.size @@ -1396,6 +1398,85 @@ def simple_attention(encoded_sequence, input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name) +@wrap_name_default() +def dot_product_attention(encoded_sequence, + attending_sequence, + transformed_state, + softmax_param_attr=None, + name=None): + """ + Calculate and return a context vector with dot-product attention mechanism. + Size of the context vector equals to size of the attending_sequence. + + .. math:: + + a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j} + + e_{i,j} & = a(s_{i-1}, h_{j}) + + a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}} + + c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j} + + where :math:`h_{j}` is the jth element of encoded_sequence, + :math:`z_{j}` is the jth element of attending_sequence, + :math:`s_{i-1}` is transformed_state + + The example usage is: + + .. code-block:: python + + context = dot_product_attention(encoded_sequence=enc_seq, + attending_sequence=att_seq, + transformed_state=state,) + + :param name: name of the dot-product attention model. + :type name: basestring + :param softmax_param_attr: parameter attribute of sequence softmax + that is used to produce attention weight. + :type softmax_param_attr: ParameterAttribute + :param encoded_sequence: output of the encoder + :type encoded_sequence: LayerOutput + :param attending_sequence: attention weight is computed by a feed forward neural + network which has two inputs : decoder's transformed + hidden state of previous time step and encoder's output. + attending_sequence is the sequence to be attended. + :type attending_sequence: LayerOutput + :param transformed_state: transformed hidden state of decoder in previous time step, + its size should equal to encoded_sequence's. Here we do the + transformation outside dot_product_attention for flexibility + consideration. + :type transformed_state: LayerOutput + :return: a context vector + :rtype: LayerOutput + """ + assert transformed_state.size == encoded_sequence.size + + expanded = expand_layer( + input=transformed_state, + expanded_as=encoded_sequence, + name='%s_expand' % name) + + m = linear_comb_layer( + weights=expanded, vectors=encoded_sequence, name='%s_dot-product') + + attention_weight = fc_layer( + input=m, + size=1, + act=SequenceSoftmaxActivation(), + param_attr=softmax_param_attr, + name="%s_softmax" % name, + bias_attr=False) + + scaled = scaling_layer( + weight=attention_weight, + input=attending_sequence, + name='%s_scaling' % name) + + return pooling_layer( + input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name) + + def inputs(layers, *args): """ Declare the inputs of network. The order of input should be as same as