提交 4545a058 编写于 作者: R ranqiu

add dot-product attention

上级 8e2cc754
......@@ -125,3 +125,8 @@ simple_attention
:members: simple_attention
:noindex:
dot_product_attention
---------------------
.. automodule:: paddle.v2.networks
:members: dot_product_attention
:noindex:
......@@ -26,8 +26,9 @@ __all__ = [
'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
"img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
'simple_attention', 'simple_gru2', 'bidirectional_gru', 'text_conv_pool',
'bidirectional_lstm', 'inputs', 'outputs'
'simple_attention', 'dot_product_attention', 'simple_gru2',
'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs',
'outputs'
]
######################################################
......@@ -1361,6 +1362,7 @@ def simple_attention(encoded_sequence,
compute attention weight.
:type transform_param_attr: ParameterAttribute
:return: a context vector
:rtype: LayerOutput
"""
assert encoded_proj.size == decoder_state.size
proj_size = encoded_proj.size
......@@ -1396,6 +1398,85 @@ def simple_attention(encoded_sequence,
input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
@wrap_name_default()
def dot_product_attention(encoded_sequence,
attending_sequence,
transformed_state,
softmax_param_attr=None,
name=None):
"""
Calculate and return a context vector with dot-product attention mechanism.
Size of the context vector equals to size of the attending_sequence.
.. math::
a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j}
e_{i,j} & = a(s_{i-1}, h_{j})
a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
where :math:`h_{j}` is the jth element of encoded_sequence,
:math:`z_{j}` is the jth element of attending_sequence,
:math:`s_{i-1}` is transformed_state
The example usage is:
.. code-block:: python
context = dot_product_attention(encoded_sequence=enc_seq,
attending_sequence=att_seq,
transformed_state=state,)
:param name: name of the dot-product attention model.
:type name: basestring
:param softmax_param_attr: parameter attribute of sequence softmax
that is used to produce attention weight.
:type softmax_param_attr: ParameterAttribute
:param encoded_sequence: output of the encoder
:type encoded_sequence: LayerOutput
:param attending_sequence: attention weight is computed by a feed forward neural
network which has two inputs : decoder's transformed
hidden state of previous time step and encoder's output.
attending_sequence is the sequence to be attended.
:type attending_sequence: LayerOutput
:param transformed_state: transformed hidden state of decoder in previous time step,
its size should equal to encoded_sequence's. Here we do the
transformation outside dot_product_attention for flexibility
consideration.
:type transformed_state: LayerOutput
:return: a context vector
:rtype: LayerOutput
"""
assert transformed_state.size == encoded_sequence.size
expanded = expand_layer(
input=transformed_state,
expanded_as=encoded_sequence,
name='%s_expand' % name)
m = linear_comb_layer(
weights=expanded, vectors=encoded_sequence, name='%s_dot-product')
attention_weight = fc_layer(
input=m,
size=1,
act=SequenceSoftmaxActivation(),
param_attr=softmax_param_attr,
name="%s_softmax" % name,
bias_attr=False)
scaled = scaling_layer(
weight=attention_weight,
input=attending_sequence,
name='%s_scaling' % name)
return pooling_layer(
input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
def inputs(layers, *args):
"""
Declare the inputs of network. The order of input should be as same as
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册