diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 62c154e65dcff1bdfb00109bf1b724c34731652e..6e1d6719ace94743da76ca34304d481ad8688483 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -364,6 +364,12 @@ split
 .. autofunction:: paddle.v2.fluid.layers.split
     :noindex:
 
+
+matmul
+------
+.. autofunction:: paddle.v2.fluid.layers.matmul
+    :noindex:
+
 logsigmoid
 ----------
 .. autofunction:: paddle.v2.fluid.layers.logsigmoid
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
index cca0dcdf082c0d809fab1aebba2c0b6c7b8efa2a..f6b1cb4ba10659fb336899f08376c265c67290f1 100644
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -25,3 +25,9 @@ glu
 .. autofunction:: paddle.v2.fluid.nets.glu
     :noindex:
 
+
+dot_product_attention
+---------------------
+.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
+    :noindex:
+
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 4e8fd407c9983e2827d3137fa4f49a8425d5dce2..808f25d6bec8eefcbd5a7d96eb2c857cf35c4f73 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -50,6 +50,7 @@ __all__ = [
     'sequence_last_step',
     'dropout',
     'split',
+    'matmul',
 ]
 
 
@@ -1597,3 +1598,73 @@ def split(input, num_or_sections, dim=-1):
             'axis': dim
         })
     return outs
+
+
+def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
+    """
+    Applies matrix multiplication to two tensors. Currently only rank 1 to
+    rank 3 input tensors are supported.
+
+    The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
+    flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
+
+    - If a transpose flag is specified, the last two dimensions of the tensor
+      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
+      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
+      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
+      opposite: it is treated as :math:`[D, 1]` in nontransposed form and as
+      :math:`[1, D]` in transposed form.
+
+    - After transpose, the two tensors are 2-D or 3-D and matrix
+      multiplication is performed in the following way.
+
+      - If both are 2-D, they are multiplied like conventional matrices.
+      - If either is 3-D, it is treated as a stack of matrices residing in the
+        last two dimensions, and a batched matrix multiply supporting
+        broadcast applies on the two tensors.
+
+    Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
+    nontransposed, the prepended or appended dimension :math:`1` will be
+    removed after matrix multiplication.
+
+    Args:
+        x (Variable): The input variable which is a Tensor or LoDTensor.
+        y (Variable): The input variable which is a Tensor or LoDTensor.
+        transpose_x (bool): Whether to transpose :math:`x` before multiplication.
+        transpose_y (bool): Whether to transpose :math:`y` before multiplication.
+        name (str|None): A name for this layer (optional). If set None, the
+            layer will be named automatically.
+
+    Returns:
+        Variable: The product Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # Examples to clarify shapes of the inputs and output
+            # x: [B, M, K], y: [B, K, N]
+            fluid.layers.matmul(x, y)  # out: [B, M, N]
+            # x: [B, M, K], y: [K, N]
+            fluid.layers.matmul(x, y)  # out: [B, M, N]
+            # x: [B, M, K], y: [K]
+            fluid.layers.matmul(x, y)  # out: [B, M]
+            # x: [M, K], y: [K, N]
+            fluid.layers.matmul(x, y)  # out: [M, N]
+            # x: [K], y: [K]
+            fluid.layers.matmul(x, y)  # out: [1]
+            # x: [M], y: [N]
+            fluid.layers.matmul(x, y, True, True)  # out: [M, N]
+    """
+    helper = LayerHelper('matmul', **locals())
+    assert max(
+        len(x.shape), len(y.shape)
+    ) <= 3, 'Currently only rank 1 to rank 3 input tensors are supported.'
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='matmul',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': out},
+        attrs={'transpose_X': transpose_x,
+               'transpose_Y': transpose_y})
+    return out
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index b5c26e713db4e5c48d6c0851fd68275aeabeea64..ee6f70b89940a1daa076c5617b05e5a845078d14 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -17,6 +17,7 @@ __all__ = [
     "simple_img_conv_pool",
     "sequence_conv_pool",
     "glu",
+    "dot_product_attention",
 ]
 
 
@@ -150,3 +151,55 @@ def glu(input, dim=-1):
     act_b = layers.sigmoid(x=b)
     out = layers.elementwise_mul(x=a, y=act_b)
     return out
+
+
+def dot_product_attention(querys, keys, values):
+    """
+    The dot-product attention.
+
+    The attention mechanism can be seen as mapping a query and a set of
+    key-value pairs to an output. The output is computed as a weighted sum of
+    the values, where the weight assigned to each value is computed by a
+    compatibility function (the dot-product here) of the query with the
+    corresponding key.
+
+    The dot-product attention can be implemented through (batch) matrix
+    multiplication as follows:
+
+        .. math::
+
+            Attention(Q, K, V) = softmax(QK^\mathrm{T})V
+
+    Refer to `Attention Is All You Need
+    <https://arxiv.org/abs/1706.03762>`_.
+
+    Note that batch data containing sequences of different lengths is not
+    supported by this function because of the (batch) matrix multiplication.
+
+    Args:
+        querys (Variable): The input variable which is a Tensor or LoDTensor.
+        keys (Variable): The input variable which is a Tensor or LoDTensor.
+        values (Variable): The input variable which is a Tensor or LoDTensor.
+
+    Returns:
+        tuple: The Tensor variables representing the output and attention
+            scores.
+
+    Examples:
+        .. code-block:: python
+
+            # Suppose q, k, v are tensor variables with the following shapes:
+            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
+            out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
+            out.shape  # [3, 5, 10]
+            attn_scores.shape  # [3, 5, 6]
+    """
+    assert keys.shape[-2] == values.shape[
+        -2], 'The shapes of keys and values mismatch.'
+    assert querys.shape[-1] == keys.shape[
+        -1], 'The shapes of querys and keys mismatch.'
+    product = layers.matmul(x=querys, y=keys, transpose_y=True)
+    attn_scores = layers.reshape(
+        x=layers.reshape(
+            x=product, shape=[-1, product.shape[-1]], act='softmax'),
+        shape=product.shape)
+    out = layers.matmul(attn_scores, values)
+    return out, attn_scores
diff --git a/python/paddle/v2/fluid/tests/test_matmul_op.py b/python/paddle/v2/fluid/tests/test_matmul_op.py
index 0220cfe1287ab6908f379f9f7451cd257dd7f0a6..f7dc4e053217dcceaf9c64e3605286fc0698593b 100644
--- a/python/paddle/v2/fluid/tests/test_matmul_op.py
+++ b/python/paddle/v2/fluid/tests/test_matmul_op.py
@@ -96,18 +96,18 @@ class Generator(object):
         self.outputs = {'Out': Out}
 
     def test_check_output(self):
-        self.check_output(atol=1e-2)
+        self.check_output(atol=1e-3)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
 
     def test_check_grad_ignore_x(self):
         self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+            ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
 
     def test_check_grad_ignore_y(self):
         self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+            ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
 
 
 # Generate test cases for all possibilities
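
Note for reviewers: a minimal sketch of how the two entry points added by this patch compose at graph-construction time. The `fluid.layers.data` variables and the concrete shapes below are illustrative assumptions for the example only; they are not part of this change.

    import paddle.v2.fluid as fluid

    # Assumed per-sample shapes (layers.data prepends an implicit batch dim):
    # q: [batch, 5, 9], k: [batch, 6, 9], v: [batch, 6, 10]
    q = fluid.layers.data(name='q', shape=[5, 9], dtype='float32')
    k = fluid.layers.data(name='k', shape=[6, 9], dtype='float32')
    v = fluid.layers.data(name='v', shape=[6, 10], dtype='float32')

    # Raw compatibility scores via the new matmul layer: QK^T.
    scores = fluid.layers.matmul(q, k, transpose_y=True)  # [batch, 5, 6]

    # The packaged net performs the same matmul, softmax-normalizes the
    # scores, and applies them to the values.
    out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
    # out: [batch, 5, 10], attn_scores: [batch, 5, 6]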