Commit 4545a058 authored by R ranqiu

add dot-product attention

Parent 8e2cc754
...@@ -125,3 +125,8 @@ simple_attention
:members: simple_attention
:noindex:
dot_product_attention
---------------------
.. automodule:: paddle.v2.networks
:members: dot_product_attention
:noindex:
...@@ -26,8 +26,9 @@ __all__ = [
'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
"img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
'simple_attention', 'dot_product_attention', 'simple_gru2',
'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs',
'outputs'
]
######################################################
...@@ -1361,6 +1362,7 @@ def simple_attention(encoded_sequence,
compute attention weight.
:type transform_param_attr: ParameterAttribute
:return: a context vector
:rtype: LayerOutput
""" """
assert encoded_proj.size == decoder_state.size assert encoded_proj.size == decoder_state.size
proj_size = encoded_proj.size proj_size = encoded_proj.size
...@@ -1396,6 +1398,85 @@ def simple_attention(encoded_sequence, ...@@ -1396,6 +1398,85 @@ def simple_attention(encoded_sequence,
input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name) input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
@wrap_name_default()
def dot_product_attention(encoded_sequence,
attending_sequence,
transformed_state,
softmax_param_attr=None,
name=None):
"""
Calculate and return a context vector with the dot-product attention mechanism.
The size of the context vector equals the size of the attending_sequence.
.. math::
a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j}
e_{i,j} & = a(s_{i-1}, h_{j})
a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
where :math:`h_{j}` is the jth element of encoded_sequence,
:math:`z_{j}` is the jth element of attending_sequence,
and :math:`s_{i-1}` is the transformed_state.
The example usage is:
.. code-block:: python
context = dot_product_attention(encoded_sequence=enc_seq,
attending_sequence=att_seq,
transformed_state=state,)
:param name: name of the dot-product attention model.
:type name: basestring
:param softmax_param_attr: parameter attribute of the sequence softmax
that is used to produce the attention weights.
:type softmax_param_attr: ParameterAttribute
:param encoded_sequence: output of the encoder
:type encoded_sequence: LayerOutput
:param attending_sequence: the sequence to be attended. Its elements are
weighted by the attention weights, which are computed
from the dot product of the decoder's transformed hidden
state of the previous time step and the encoder's output,
and then summed to form the context vector.
:type attending_sequence: LayerOutput
:param transformed_state: transformed hidden state of the decoder in the previous
time step; its size should equal encoded_sequence's. The
transformation is done outside dot_product_attention for
flexibility.
:type transformed_state: LayerOutput
:return: a context vector
:rtype: LayerOutput
"""
assert transformed_state.size == encoded_sequence.size
expanded = expand_layer(
input=transformed_state,
expanded_as=encoded_sequence,
name='%s_expand' % name)
m = linear_comb_layer(
weights=expanded, vectors=encoded_sequence, name='%s_dot-product' % name)
attention_weight = fc_layer(
input=m,
size=1,
act=SequenceSoftmaxActivation(),
param_attr=softmax_param_attr,
name="%s_softmax" % name,
bias_attr=False)
scaled = scaling_layer(
weight=attention_weight,
input=attending_sequence,
name='%s_scaling' % name)
return pooling_layer(
input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
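# A minimal NumPy sketch (not part of this commit) of the equations in the
# docstring above, assuming plain arrays instead of Paddle layers. The names
# dot_product_attention_np, enc, att and state are illustrative only, and the
# sketch omits the learnable 1x1 weight that the fc_layer applies before the
# sequence softmax.
import numpy as np

def dot_product_attention_np(enc, att, state):
    # enc: (T, d) encoded_sequence; att: (T, m) attending_sequence;
    # state: (d,) transformed decoder state of the previous time step.
    scores = enc.dot(state)                  # e_j = s^T h_j, shape (T,)
    weights = np.exp(scores - scores.max())  # numerically stable softmax
    weights /= weights.sum()                 # a_j, sums to 1 over the sequence
    return weights.dot(att)                  # c = sum_j a_j * z_j, shape (m,)

# Toy usage: a 5-step sequence, encoder size 8, attended size 6.
rng = np.random.default_rng(0)
context = dot_product_attention_np(rng.normal(size=(5, 8)),
                                   rng.normal(size=(5, 6)),
                                   rng.normal(size=8))
assert context.shape == (6,)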
def inputs(layers, *args):
"""
Declare the inputs of network. The order of input should be the same as
......