# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import layers __all__ = [ "simple_img_conv_pool", "sequence_conv_pool", "glu", "scaled_dot_product_attention", ] def simple_img_conv_pool(input, num_filters, filter_size, pool_size, pool_stride, act, param_attr=None, pool_type='max', use_cudnn=True): conv_out = layers.conv2d( input=input, num_filters=num_filters, filter_size=filter_size, param_attr=param_attr, act=act, use_cudnn=use_cudnn) pool_out = layers.pool2d( input=conv_out, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, use_cudnn=use_cudnn) return pool_out def img_conv_group(input, conv_num_filter, pool_size, conv_padding=1, conv_filter_size=3, conv_act=None, param_attr=None, conv_with_batchnorm=False, conv_batchnorm_drop_rate=0.0, pool_stride=1, pool_type=None, use_cudnn=True): """ Image Convolution Group, Used for vgg net. """ tmp = input assert isinstance(conv_num_filter, list) or \ isinstance(conv_num_filter, tuple) def __extend_list__(obj): if not hasattr(obj, '__len__'): return [obj] * len(conv_num_filter) else: return obj conv_padding = __extend_list__(conv_padding) conv_filter_size = __extend_list__(conv_filter_size) param_attr = __extend_list__(param_attr) conv_with_batchnorm = __extend_list__(conv_with_batchnorm) conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate) for i in xrange(len(conv_num_filter)): local_conv_act = conv_act if conv_with_batchnorm[i]: local_conv_act = None tmp = layers.conv2d( input=tmp, num_filters=conv_num_filter[i], filter_size=conv_filter_size[i], padding=conv_padding[i], param_attr=param_attr[i], act=local_conv_act, use_cudnn=use_cudnn) if conv_with_batchnorm[i]: tmp = layers.batch_norm(input=tmp, act=conv_act) drop_rate = conv_batchnorm_drop_rate[i] if abs(drop_rate) > 1e-5: tmp = layers.dropout(x=tmp, dropout_prob=drop_rate) pool_out = layers.pool2d( input=tmp, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, use_cudnn=use_cudnn) return pool_out def sequence_conv_pool(input, num_filters, filter_size, param_attr=None, act="sigmoid", pool_type="max"): conv_out = layers.sequence_conv( input=input, num_filters=num_filters, filter_size=filter_size, param_attr=param_attr, act=act) pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type) return pool_out def glu(input, dim=-1): """ The gated linear unit composed by split, sigmoid activation and elementwise multiplication. Specifically, Split the input into two equal sized parts :math:`a` and :math:`b` along the given dimension and then compute as following: .. math:: {GLU}(a, b)= a \otimes \sigma(b) Refer to `Language Modeling with Gated Convolutional Networks `_. Args: input (Variable): The input variable which is a Tensor or LoDTensor. dim (int): The dimension along which to split. If :math:`dim < 0`, the dimension to split along is :math:`rank(input) + dim`. Returns: Variable: The Tensor variable with half the size of input. Examples: .. code-block:: python # x is a Tensor variable with shape [3, 6, 9] fluid.nets.glu(input=x, dim=1) # shape of output: [3, 3, 9] """ a, b = layers.split(input, num_or_sections=2, dim=dim) act_b = layers.sigmoid(x=b) out = layers.elementwise_mul(x=a, y=act_b) return out def scaled_dot_product_attention(queries, keys, values, num_heads=1, dropout_rate=0.): """ The dot-product attention. Attention mechanism can be seen as mapping a query and a set of key-value pairs to an output. The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function (dot-product here) of the query with the corresponding key. The dot-product attention can be implemented through (batch) matrix multipication as follows: .. math:: Attention(Q, K, V)= softmax(QK^\mathrm{T})V Refer to `Attention Is All You Need `_. Args: queries (Variable): The input variable which should be a 3-D Tensor. keys (Variable): The input variable which should be a 3-D Tensor. values (Variable): The input variable which should be a 3-D Tensor. num_heads (int): Head number to compute the scaled dot product attention. Default value is 1. dropout_rate (float): The dropout rate to drop the attention weight. Default value is 0. Returns: Variable: A 3-D Tensor computed by multi-head scaled dot product \ attention. Raises: ValueError: If input queries, keys, values are not 3-D Tensors. NOTE: 1. When num_heads > 1, three linear projections are learned respectively to map input queries, keys and values into queries', keys' and values'. queries', keys' and values' have the same shapes with queries, keys and values. 1. When num_heads == 1, scaled_dot_product_attention has no learnable parameters. Examples: .. code-block:: python # Suppose q, k, v are Tensors with the following shape: # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10] contexts = fluid.nets.scaled_dot_product_attention(q, k, v) contexts.shape # [3, 5, 10] """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( "Inputs quries, keys and values should all be 3-D tensors.") if queries.shape[-1] != keys.shape[-1]: raise ValueError( "The hidden size of queries and keys should be the same.") if keys.shape[-2] != values.shape[-2]: raise ValueError( "The max sequence length in query batch and in key batch " "should be the same.") if keys.shape[-1] % num_heads != 0: raise ValueError("The hidden size of keys (%d) must be divisible " "by the number of attention heads (%d)." % (keys.shape[-1], num_heads)) if values.shape[-1] % num_heads != 0: raise ValueError("The hidden size of values (%d) must be divisible " "by the number of attention heads (%d)." % (values.shape[-1], num_heads)) def __compute_qkv(queries, keys, values, num_heads): """ Add linear projection to queries, keys, and values. Args: queries(Tensor): a 3-D input Tensor. keys(Tensor): a 3-D input Tensor. values(Tensor): a 3-D input Tensor. num_heads(int): The number of heads. Linearly project the inputs ONLY when num_heads > 1. Returns: Tensor: linearly projected output Tensors: queries', keys' and values'. They have the same shapes with queries, keys and values. """ if num_heads == 1: return queries, keys, values q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2) k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2) v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2) return q, k, v def __split_heads(x, num_heads): """ Reshape the last dimension of inpunt tensor x so that it becomes two dimensions. Args: x(Tensor): a 3-D input Tensor. num_heads(int): The number of heads. Returns: Tensor: a Tensor with shape [..., n, m/num_heads], where m is size of the last dimension of x. """ if num_heads == 1: return x hidden_size = x.shape[-1] # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim] # into a 4-D output: # [batch_size, max_sequence_length, num_heads, hidden_size_per_head]. reshaped = layers.reshape( x=x, shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads]) # permuate the dimensions into: # [batch_size, num_heads, max_sequence_len, hidden_size_per_head] return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ Reshape the last two dimensions of inpunt tensor x so that it becomes one dimension. Args: x(Tensor): a 4-D input Tensor with shape [bs, num_heads, max_sequence_length, hidden_dim]. Returns: Tensor: a Tensor with shape [bs, max_sequence_length, num_heads * hidden_dim]. """ if len(x.shape) == 3: return x if len(x.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) return layers.reshape( x=trans_x, shape=map(int, [ trans_x.shape[0], trans_x.shape[1], trans_x.shape[2] * trans_x.shape[3] ])) q, k, v = __compute_qkv(queries, keys, values, num_heads) q = __split_heads(q, num_heads) k = __split_heads(k, num_heads) v = __split_heads(v, num_heads) key_dim_per_head = keys.shape[-1] // num_heads scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5) product = layers.matmul(x=k, y=scaled_q, transpose_y=True) weights = layers.reshape( x=layers.reshape( x=product, shape=[-1, product.shape[-1]], act="softmax"), shape=product.shape) if dropout_rate: weights = layers.dropout( weights, dropout_prob=dropout_rate, is_test=False) ctx_multiheads = layers.matmul(weights, v) return __combine_heads(ctx_multiheads)