Commit 9bcb2d26 authored by G guosheng

Add python wrapper for matmul_op and dot_product_attention

Parent 234013a9
...@@ -364,6 +364,12 @@ split
.. autofunction:: paddle.v2.fluid.layers.split
    :noindex:


matmul
------
.. autofunction:: paddle.v2.fluid.layers.matmul
    :noindex:


logsigmoid
----------
.. autofunction:: paddle.v2.fluid.layers.logsigmoid
    :noindex:
......
...@@ -25,3 +25,9 @@ glu
.. autofunction:: paddle.v2.fluid.nets.glu
    :noindex:


dot_product_attention
---------------------
.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
    :noindex:
...@@ -37,6 +37,7 @@ __all__ = [
    'sequence_last_step',
    'dropout',
    'split',
    'matmul',
]
...@@ -1586,83 +1587,71 @@ def split(input, num_or_sections, dim=-1):
    return outs


def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
    """
    Applies matrix multiplication to two tensors. Currently only rank 1 to
    rank 3 input tensors are supported.

    The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
    flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:

    - If a transpose flag is specified, the last two dimensions of the tensor
      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
      opposite: It is treated as :math:`[D, 1]` in nontransposed form and as
      :math:`[1, D]` in transposed form.

    - After transpose, the two tensors are 2-D or 3-D and matrix
      multiplication is performed in the following way.

      - If both are 2-D, they are multiplied like conventional matrices.
      - If either is 3-D, it is treated as a stack of matrices residing in
        the last two dimensions and a batched matrix multiply supporting
        broadcast applies on the two tensors.

    Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
    nontransposed, the prepended or appended dimension :math:`1` will be
    removed after matrix multiplication.

    Args:
        x (Variable): The input variable which is a Tensor or LoDTensor.
        y (Variable): The input variable which is a Tensor or LoDTensor.
        transpose_x (bool): Whether to transpose :math:`x` before
            multiplication.
        transpose_y (bool): Whether to transpose :math:`y` before
            multiplication.
        name (str|None): A name for this layer (optional). If set None, the
            layer will be named automatically.

    Returns:
        Variable: The product Tensor variable.

    Examples:
        .. code-block:: python

            # Examples to clarify shapes of the inputs and output
            # x: [B, M, K], y: [B, K, N]
            fluid.layers.matmul(x, y)  # out: [B, M, N]
            # x: [B, M, K], y: [K, N]
            fluid.layers.matmul(x, y)  # out: [B, M, N]
            # x: [B, M, K], y: [K]
            fluid.layers.matmul(x, y)  # out: [B, M]
            # x: [M, K], y: [K, N]
            fluid.layers.matmul(x, y)  # out: [M, N]
            # x: [K], y: [K]
            fluid.layers.matmul(x, y)  # out: [1]
            # x: [M], y: [N]
            fluid.layers.matmul(x, y, True, True)  # out: [M, N]
    """
    helper = LayerHelper('matmul', **locals())
    assert max(
        len(x.shape), len(y.shape)
    ) <= 3, 'Currently only rank 1 to rank 3 input tensors are supported.'
    out = helper.create_tmp_variable(dtype=helper.input_dtype())
    helper.append_op(
        type='matmul',
        inputs={'X': x,
                'Y': y},
        outputs={'Out': out},
        attrs={'transpose_X': transpose_x,
               'transpose_Y': transpose_y})
    return out
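To make the shape rules documented above concrete, the following is a minimal NumPy sketch, not the Paddle implementation; it assumes numpy is available, and the helper name np_matmul_sketch is hypothetical. It only mirrors the rank-1 to rank-3 shape behavior described in the docstring.

import numpy as np

def np_matmul_sketch(x, y, transpose_x=False, transpose_y=False):
    # Per the docstring, a rank-1 nontransposed input gets a dimension of 1
    # prepended (for x) or appended (for y) that is removed again afterwards.
    squeeze_m = x.ndim == 1 and not transpose_x
    squeeze_n = y.ndim == 1 and not transpose_y
    # Rank-1 x is viewed as [1, D] (or [D, 1] when transposed); rank-1 y is
    # the opposite. For rank-2/3 inputs the transpose swaps the last two dims.
    if x.ndim == 1:
        x = x.reshape(1, -1) if not transpose_x else x.reshape(-1, 1)
    elif transpose_x:
        x = np.swapaxes(x, -1, -2)
    if y.ndim == 1:
        y = y.reshape(-1, 1) if not transpose_y else y.reshape(1, -1)
    elif transpose_y:
        y = np.swapaxes(y, -1, -2)
    out = np.matmul(x, y)  # broadcasts the batch dimension for 3-D inputs
    if squeeze_m and squeeze_n:  # x: [K], y: [K] -> out: [1]
        return out.reshape(1)
    if squeeze_m:
        out = np.squeeze(out, axis=-2)
    if squeeze_n:
        out = np.squeeze(out, axis=-1)
    return out

print(np_matmul_sketch(np.ones([2, 3, 4]), np.ones([4, 5])).shape)  # (2, 3, 5)
print(np_matmul_sketch(np.ones([4]), np.ones([2, 4, 5])).shape)     # (2, 5)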
...@@ -4,6 +4,7 @@ __all__ = [
    "simple_img_conv_pool",
    "sequence_conv_pool",
    "glu",
    "dot_product_attention",
]
...@@ -135,3 +136,55 @@ def glu(input, dim=-1):
    a, b = layers.split(input, num_or_sections=2, dim=dim)
    out = layers.elementwise_mul(x=a, y=b)
    return out


def dot_product_attention(querys, keys, values):
    """
    The dot-product attention.

    Attention mechanism can be seen as mapping a query and a set of key-value
    pairs to an output. The output is computed as a weighted sum of the
    values, where the weight assigned to each value is computed by a
    compatibility function (dot-product here) of the query with the
    corresponding key.

    The dot-product attention can be implemented through (batch) matrix
    multiplication as follows:

    .. math::

        Attention(Q, K, V) = softmax(QK^\mathrm{T})V

    Refer to `Attention Is All You Need
    <https://arxiv.org/pdf/1706.03762.pdf>`_.

    Note that batch data containing sequences of different lengths is not
    supported here because of the (batch) matrix multiplication.

    Args:
        querys (Variable): The input variable which is a Tensor or LoDTensor.
        keys (Variable): The input variable which is a Tensor or LoDTensor.
        values (Variable): The input variable which is a Tensor or LoDTensor.

    Returns:
        tuple: The Tensor variables representing the output and attention
            scores.

    Examples:
        .. code-block:: python

            # Suppose q, k, v are tensor variables with the following shape:
            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
            out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
            out.shape  # [3, 5, 10]
            attn_scores.shape  # [3, 5, 6]
    """
    assert keys.shape[-2] == values.shape[
        -2], 'The shapes of keys and values mismatch.'
    assert querys.shape[-1] == keys.shape[
        -1], 'The shapes of querys and keys mismatch.'
    product = layers.matmul(x=querys, y=keys, transpose_y=True)
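    # Softmax is applied over the last (keys) dimension: flatten the scores
    # to 2-D, use softmax as the activation of the inner reshape, then
    # restore the original shape.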
    attn_scores = layers.reshape(
        x=layers.reshape(
            x=product, shape=[-1, product.shape[-1]], act='softmax'),
        shape=product.shape)
    out = layers.matmul(attn_scores, values)
    return out, attn_scores
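For reference, a minimal NumPy sketch of the same computation (the helper name np_dot_product_attention is hypothetical and numpy is assumed), reproducing the shapes from the docstring example above:

import numpy as np

def np_dot_product_attention(q, k, v):
    # q: [B, Lq, D], k: [B, Lk, D], v: [B, Lk, Dv]
    scores = np.matmul(q, np.swapaxes(k, -1, -2))        # [B, Lq, Lk]
    scores = np.exp(scores - scores.max(axis=-1, keepdims=True))
    attn = scores / scores.sum(axis=-1, keepdims=True)   # softmax over keys
    return np.matmul(attn, v), attn                      # [B, Lq, Dv], [B, Lq, Lk]

q = np.random.rand(3, 5, 9)
k = np.random.rand(3, 6, 9)
v = np.random.rand(3, 6, 10)
out, attn = np_dot_product_attention(q, k, v)
print(out.shape, attn.shape)  # (3, 5, 10) (3, 5, 6)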
...@@ -83,18 +83,18 @@ class Generator(object):
        self.outputs = {'Out': Out}

    def test_check_output(self):
        self.check_output(atol=1e-3)

    def test_check_grad_normal(self):
        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)

    def test_check_grad_ignore_x(self):
        self.check_grad(
            ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))

    def test_check_grad_ignore_y(self):
        self.check_grad(
            ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))


# Generate test cases for all possibilities
......