From 9bcb2d268e4cee2e9f67d806288d7c71182266b3 Mon Sep 17 00:00:00 2001
From: guosheng
Date: Wed, 17 Jan 2018 17:25:00 +0800
Subject: [PATCH] Add python wrapper for matmul_op and dot_product_attention

---
 doc/api/v2/fluid/layers.rst                 |   6 +
 doc/api/v2/fluid/nets.rst                   |   6 +
 python/paddle/v2/fluid/layers/nn.py         | 115 ++++++++----
 python/paddle/v2/fluid/nets.py              |  53 ++++++++
 .../paddle/v2/fluid/tests/test_matmul_op.py |   8 +-
 5 files changed, 121 insertions(+), 67 deletions(-)

diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 62c154e65dc..6e1d6719ace 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -364,6 +364,12 @@ split
 .. autofunction:: paddle.v2.fluid.layers.split
     :noindex:
 
+
+matmul
+------
+.. autofunction:: paddle.v2.fluid.layers.matmul
+    :noindex:
+
 logsigmoid
 ----------
 .. autofunction:: paddle.v2.fluid.layers.logsigmoid
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
index cca0dcdf082..f6b1cb4ba10 100644
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -25,3 +25,9 @@ glu
 .. autofunction:: paddle.v2.fluid.nets.glu
     :noindex:
 
+
+dot_product_attention
+---------------------
+.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
+    :noindex:
+
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index bc373d4b37f..68499f1feba 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -37,6 +37,7 @@ __all__ = [
     'sequence_last_step',
     'dropout',
     'split',
+    'matmul',
 ]
 
 
@@ -1586,83 +1587,71 @@ def split(input, num_or_sections, dim=-1):
     return outs
 
 
-def matmul(x, y):
+def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     """
-    Applies matrix multipication to two tensors.
+    Applies matrix multiplication to two tensors. Currently only rank 1 to rank
+    3 input tensors are supported.
 
-    This operator is used to perform (batched) matrix multiplication
-    over the last two dimensions of the input tensors `X` and `Y`.
+    The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
+    flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
 
-    If a transpose flag is specified, the last two dimensions of the
-    tensor are transposed. If the tensor is rank-1 of shape [D], then
-    for `X` it is treated as [1, D] in nontransposed form and as [D, 1]
-    in transposed form, whereas for `Y` it is the opposite: It is treated
-    as [D, 1] in nontransposed form and as [1, D] in transposed form.
+    - If a transpose flag is specified, the last two dimensions of the tensor
+      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
+      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
+      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
+      opposite: It is treated as :math:`[D, 1]` in nontransposed form and as
+      :math:`[1, D]` in transposed form.
 
-    Examples without transpose:
-    - X: [K], Y: [K] => Out: [1]
-    - X: [K], Y: [K, N] => Out: [N]
-    - X: [B, M, K], Y: [K] => Out: [B, M]
-    - X: [M, K], Y: [B, K, N] => Out: [B, M, N]
-    - X: [B, M, K], Y: [B, K, N] => Out: [B, M, N]
+    - After transpose, the two tensors are 2-D or 3-D and matrix multiplication
+      is performed in the following way.
 
-    The behavior is designed to be similar to the `numpy.matmul` function.
-    The differences are:
-    - Currently only rank 1 to rank 3 input tensors are supported.
-    - We add `transpose_X` and `transpose_Y` flags.
+      - If both are 2-D, they are multiplied like conventional matrices.
+      - If either is 3-D, it is treated as a stack of matrices residing in the
+        last two dimensions and a batched matrix multiply supporting broadcast
+        applies on the two tensors.
 
-    Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-    or not. But the output only shares the LoD information with input `X`.
+    Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
+    nontransposed, the prepended or appended dimension :math:`1` will be
+    removed after matrix multiplication.
 
     Args:
         x (Variable): The input variable which is a Tensor or LoDTensor.
-        y (Variable): If :attr:`num_or_sections` is an integer,
-            then the integer indicates the number of equal sized sub-tensors
-            that the tensor will be divided into. If :attr:`num_or_sections`
-            is a list of integers, the length of list indicates the number of
-            sub-tensors and the integers indicate the sizes of sub-tensors'
-            :attr:`dim` dimension orderly.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the
-            dimension to split along is :math:`rank(input) + dim`.
+        y (Variable): The input variable which is a Tensor or LoDTensor.
+        transpose_x (bool): Whether to transpose :math:`x` before multiplication.
+        transpose_y (bool): Whether to transpose :math:`y` before multiplication.
+        name (str|None): A name for this layer (optional). If set None, the layer
+            will be named automatically.
 
     Returns:
-        List: The list of segmented tensor variables.
+        Variable: The product Tensor variable.
 
     Examples:
         .. code-block:: python
 
-            # x is a Tensor variable with shape [3, 9, 5]:
-            x0, x1, x2 = fluid.layers.split(x, num_or_sections=3, dim=1)
-            x0.shape  # [3, 3, 5]
-            x1.shape  # [3, 3, 5]
-            x2.shape  # [3, 3, 5]
-            x0, x1, x2 = fluid.layers.split(x, num_or_sections=[2, 3, 4], dim=1)
-            x0.shape  # [3, 2, 5]
-            x1.shape  # [3, 3, 5]
-            x2.shape  # [3, 4, 5]
+            # Examples to clarify shapes of the inputs and output
+            # x: [B, M, K], y: [B, K, N]
+            fluid.layers.matmul(x, y)  # out: [B, M, N]
+            # x: [B, M, K], y: [K, N]
+            fluid.layers.matmul(x, y)  # out: [B, M, N]
+            # x: [B, M, K], y: [K]
+            fluid.layers.matmul(x, y)  # out: [B, M]
+            # x: [M, K], y: [K, N]
+            fluid.layers.matmul(x, y)  # out: [M, N]
+            # x: [K], y: [K]
+            fluid.layers.matmul(x, y)  # out: [1]
+            # x: [M], y: [N]
+            fluid.layers.matmul(x, y, True, True)  # out: [M, N]
     """
-    helper = LayerHelper('split', **locals())
-    input_shape = input.shape
-    dim = (len(input_shape) + dim) if dim < 0 else dim
-    if isinstance(num_or_sections, int):
-        assert num_or_sections > 1, 'num_or_sections must be more than 1.'
-        num = num_or_sections
-    else:
-        assert len(num_or_sections) < input_shape[
-            dim], 'len(num_or_sections) must not be more than input.shape[dim].'
-        num = len(num_or_sections)
-    outs = [
-        helper.create_tmp_variable(dtype=helper.input_dtype())
-        for i in range(num)
-    ]
+    helper = LayerHelper('matmul', **locals())
+    assert max(
        len(x.shape), len(y.shape)
+    ) <= 3, 'Currently only rank 1 to rank 3 input tensors are supported.'
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
     helper.append_op(
-        type='split',
-        inputs={'X': input},
-        outputs={'Out': outs},
-        attrs={
-            'num': num_or_sections if isinstance(num_or_sections, int) else 0,
-            'sections': num_or_sections
-            if isinstance(num_or_sections, list) else [],
-            'axis': dim
-        })
-    return outs
+        type='matmul',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': out},
+        attrs={'transpose_X': transpose_x,
+               'transpose_Y': transpose_y})
+    return out
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index d515429216c..18bef45d66b 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -4,6 +4,7 @@ __all__ = [
     "simple_img_conv_pool",
     "sequence_conv_pool",
     "glu",
+    "dot_product_attention",
 ]
 
 
@@ -135,3 +136,55 @@ def glu(input, dim=-1):
     a, b = layers.split(input, num_or_sections=2, dim=dim)
     out = layers.elementwise_mul(x=a, y=b)
     return out
+
+
+def dot_product_attention(querys, keys, values):
+    """
+    The dot-product attention.
+
+    Attention mechanism can be seen as mapping a query and a set of key-value
+    pairs to an output. The output is computed as a weighted sum of the values,
+    where the weight assigned to each value is computed by a compatibility
+    function (dot-product here) of the query with the corresponding key.
+
+    The dot-product attention can be implemented through (batch) matrix
+    multiplication as follows:
+
+        .. math::
+
+            Attention(Q, K, V) = softmax(QK^\mathrm{T})V
+
+    Refer to `Attention Is All You Need
+    <https://arxiv.org/abs/1706.03762>`_.
+
+    Note that batch data containing sequences with different lengths is not
+    supported by this operation because of the (batch) matrix multiplication.
+
+    Args:
+        querys (Variable): The input variable which is a Tensor or LoDTensor.
+        keys (Variable): The input variable which is a Tensor or LoDTensor.
+        values (Variable): The input variable which is a Tensor or LoDTensor.
+
+    Returns:
+        tuple: The Tensor variables representing the output and attention scores.
+
+    Examples:
+        .. code-block:: python
+
+            # Suppose q, k, v are tensor variables with the following shape:
+            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
+            out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
+            out.shape  # [3, 5, 10]
+            attn_scores.shape  # [3, 5, 6]
+    """
+    assert keys.shape[-2] == values.shape[
+        -2], 'The shapes of keys and values mismatch.'
+    assert querys.shape[-1] == keys.shape[
+        -1], 'The shapes of querys and keys mismatch.'
+    product = layers.matmul(x=querys, y=keys, transpose_y=True)
+    attn_scores = layers.reshape(
+        x=layers.reshape(
+            x=product, shape=[-1, product.shape[-1]], act='softmax'),
+        shape=product.shape)
+    out = layers.matmul(attn_scores, values)
+    return out, attn_scores
diff --git a/python/paddle/v2/fluid/tests/test_matmul_op.py b/python/paddle/v2/fluid/tests/test_matmul_op.py
index d51572c8ab7..3ad714ad673 100644
--- a/python/paddle/v2/fluid/tests/test_matmul_op.py
+++ b/python/paddle/v2/fluid/tests/test_matmul_op.py
@@ -83,18 +83,18 @@ class Generator(object):
         self.outputs = {'Out': Out}
 
     def test_check_output(self):
-        self.check_output(atol=1e-2)
+        self.check_output(atol=1e-3)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
 
     def test_check_grad_ignore_x(self):
         self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+            ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
 
     def test_check_grad_ignore_y(self):
         self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+            ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
 
 
 # Generate test cases for all possibilities
-- 
GitLab
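
To make the shape rules in the new matmul docstring easier to verify, here is a minimal usage sketch, assuming the patch above is applied. It only builds a program and prints the statically inferred output shapes; the `make_input` helper and the variable names are illustrative, not part of the patch.

.. code-block:: python

    import paddle.v2.fluid as fluid

    def make_input(name, shape):
        # append_batch_size=False keeps the declared shape exactly as written.
        return fluid.layers.data(
            name=name, shape=shape, dtype='float32', append_batch_size=False)

    # x: [B, M, K], y: [K, N]  =>  out: [B, M, N]
    x = make_input('x', [2, 3, 4])
    y = make_input('y', [4, 5])
    print(fluid.layers.matmul(x, y).shape)  # expected (2, 3, 5)

    # With both transpose flags set, the last two dimensions are swapped first:
    # a: [4, 3] is read as [3, 4], b: [5, 4] is read as [4, 5]  =>  out: [3, 5]
    a = make_input('a', [4, 3])
    b = make_input('b', [5, 4])
    print(fluid.layers.matmul(a, b, transpose_x=True, transpose_y=True).shape)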
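
Similarly, a sketch of running the new `fluid.nets.dot_product_attention` helper end to end, reusing the shapes from its docstring example. The executor boilerplate (`CPUPlace`, feeding numpy arrays) is an assumption based on the `paddle.v2.fluid` API of this era and is not exercised by the patch itself; the final check only confirms that the softmax-normalized attention weights sum to one over the keys dimension.

.. code-block:: python

    import numpy as np
    import paddle.v2.fluid as fluid

    # Shapes follow the docstring example: q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10].
    q = fluid.layers.data(name='q', shape=[3, 5, 9], dtype='float32', append_batch_size=False)
    k = fluid.layers.data(name='k', shape=[3, 6, 9], dtype='float32', append_batch_size=False)
    v = fluid.layers.data(name='v', shape=[3, 6, 10], dtype='float32', append_batch_size=False)

    # out: [3, 5, 10], attn_scores: [3, 5, 6]
    out, attn_scores = fluid.nets.dot_product_attention(q, k, v)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    feed = {
        'q': np.random.rand(3, 5, 9).astype('float32'),
        'k': np.random.rand(3, 6, 9).astype('float32'),
        'v': np.random.rand(3, 6, 10).astype('float32'),
    }
    out_np, attn_np = exe.run(fluid.default_main_program(),
                              feed=feed,
                              fetch_list=[out, attn_scores])
    print(out_np.shape, attn_np.shape)             # (3, 5, 10) (3, 5, 6)
    print(np.allclose(attn_np.sum(axis=-1), 1.0))  # attention weights sum to 1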