diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 3181a04e9dae385863218806f892574e514e720e..19fee8ec67b7a714ef9da9031e30616eada881c0 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -108,16 +108,17 @@ def fc(input, into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input tensor is flattened: the first `num_flatten_dims` - dimensions will be flatten to form the first - dimension of the final matrix (height of the - matrix), and the rest `rank(X) - num_flatten_dims` - dimensions are flattened to form the second - dimension of the final matrix (width of the matrix). - For example, suppose `X` is a 6-dimensional tensor - with a shape [2, 3, 4, 5, 6], and - `num_flatten_dims` = 3. Then, the flattened matrix - will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. - By default, `num_flatten_dims` is set to 1. + (inclusive, index starts from 1) dimensions will + be flatten to form the first dimension of the + final matrix (height of the matrix), and the rest + `rank(X) - num_flatten_dims` dimensions are + flattened to form the second dimension of the + final matrix (width of the matrix). For example, + suppose `X` is a 6-dimensional tensor with a shape + [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then, + the flattened matrix will have a shape + [2 x 3 x 4, 5 x 6] = [24, 30]. By default, + `num_flatten_dims` is set to 1. param_attr(ParamAttr|list): The parameter attribute for learnable parameters/weights of the fully connected layer. @@ -158,6 +159,7 @@ def fc(input, param_shape = [ reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) ] + [size] + w = helper.create_parameter( attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) tmp = helper.create_tmp_variable(dtype) @@ -747,7 +749,7 @@ def square_error_cost(input, label, **kwargs): This layer accepts input predictions and target label and returns the squared error cost. - + For predictions, :math:`X`, and target labels, :math:`Y`, the equation is: .. math:: diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py index bfc034a1113f1dea21be05064777e356852be47d..e18852c59662539f90778e74f2e11643b9a6fe8e 100644 --- a/python/paddle/v2/fluid/nets.py +++ b/python/paddle/v2/fluid/nets.py @@ -197,15 +197,27 @@ def scaled_dot_product_attention(queries, Variable: A 3-D Tensor computed by multi-head scaled dot product attention. + Raises: + + ValueError: If input queries, keys, values are not 3-D Tensors. + + NOTE: + 1. When num_heads > 1, three linear projections are learned respectively + to map input queries, keys and values into queries', keys' and values'. + queries', keys' and values' have the same shapes with queries, keys + and values. + + 1. When num_heads == 1, scaled_dot_product_attention has no learnable + parameters. + Examples: .. code-block:: python # Suppose q, k, v are Tensors with the following shape: # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10] - contexts = fluid.nets.dot_product_attention(q, k, v) - out.shape # [3, 5, 10] - attn_scores.shape # [3, 5, 6] + contexts = fluid.nets.scaled_dot_product_attention(q, k, v) + contexts.shape # [3, 5, 10] """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( @@ -228,6 +240,22 @@ def scaled_dot_product_attention(queries, (values.shape[-1], num_heads)) def __compute_qkv(queries, keys, values, num_heads): + """ + Add linear projection to queries, keys, and values. + + Args: + queries(Tensor): a 3-D input Tensor. + keys(Tensor): a 3-D input Tensor. + values(Tensor): a 3-D input Tensor. + num_heads(int): The number of heads. Linearly project the inputs + ONLY when num_heads > 1. + + Returns: + Tensor: linearly projected output Tensors: queries', keys' and + values'. They have the same shapes with queries, keys and + values. + """ + if num_heads == 1: return queries, keys, values diff --git a/python/paddle/v2/fluid/tests/test_iou_similarity_op.py b/python/paddle/v2/fluid/tests/test_iou_similarity_op.py old mode 100755 new mode 100644 diff --git a/python/paddle/v2/fluid/tests/test_multihead_attention.py b/python/paddle/v2/fluid/tests/test_multihead_attention.py index d52599004271a2bc234e50f9c5244140f7cd2086..54ec3e3d6e53f35d6a518ef659853e1a13c1711f 100644 --- a/python/paddle/v2/fluid/tests/test_multihead_attention.py +++ b/python/paddle/v2/fluid/tests/test_multihead_attention.py @@ -65,6 +65,7 @@ class TestMultiheadAttention(unittest.TestCase): self.set_inputs(place) exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) output = exe.run(fluid.default_main_program(), feed=self.inputs, fetch_list=self.fetch_list, @@ -90,6 +91,8 @@ class TestMultiheadAttention(unittest.TestCase): self.set_program() self.run_program() + #fixme(caoying) add more meaningfull unittest. + if __name__ == '__main__': unittest.main()