#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import layers
F
fengjiayi 已提交
15

16 17 18
# Names exported by `from nets import *`; every public network builder
# defined in this module is listed here.
__all__ = [
    "simple_img_conv_pool",
    "sequence_conv_pool",
    "glu",
    "scaled_dot_product_attention",
    "img_conv_group",
]
D
dzhwinter 已提交
22

F
fengjiayi 已提交
23 24 25

def simple_img_conv_pool(input,
                         num_filters,
                         filter_size,
                         pool_size,
                         pool_stride,
                         act,
                         param_attr=None,
                         pool_type='max',
                         use_cudnn=True,
                         use_mkldnn=False):
    """
    Build a 2-D convolution layer followed by a 2-D pooling layer.

    Args:
        input: Input handed to ``layers.conv2d``.
        num_filters: Forwarded to ``layers.conv2d`` as ``num_filters``.
        filter_size: Forwarded to ``layers.conv2d`` as ``filter_size``.
        pool_size: Forwarded to ``layers.pool2d`` as ``pool_size``.
        pool_stride: Forwarded to ``layers.pool2d`` as ``pool_stride``.
        act: Activation forwarded to ``layers.conv2d``.
        param_attr: Parameter attribute forwarded to ``layers.conv2d``.
        pool_type: Pooling type forwarded to ``layers.pool2d``.
            Default is 'max'.
        use_cudnn: Forwarded to both the convolution and the pooling layer.
        use_mkldnn: Forwarded to the convolution layer only.

    Returns:
        The output of the pooling layer.
    """
    # Convolution stage.
    feature_map = layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
        act=act,
        use_cudnn=use_cudnn,
        use_mkldnn=use_mkldnn)

    # Pooling stage applied directly on the convolution output.
    return layers.pool2d(
        input=feature_map,
        pool_size=pool_size,
        pool_type=pool_type,
        pool_stride=pool_stride,
        use_cudnn=use_cudnn)


def img_conv_group(input,
                   conv_num_filter,
                   pool_size,
                   conv_padding=1,
                   conv_filter_size=3,
                   conv_act=None,
                   param_attr=None,
                   conv_with_batchnorm=False,
                   conv_batchnorm_drop_rate=0.0,
                   pool_stride=1,
                   pool_type=None,
                   use_cudnn=True,
                   use_mkldnn=False):
    """
    Image Convolution Group, Used for vgg net.

    Stacks ``len(conv_num_filter)`` convolution layers (each optionally
    followed by batch normalization and dropout) and finishes with a single
    pooling layer.

    Args:
        input: Input handed to the first convolution layer.
        conv_num_filter (list|tuple): Number of filters of every convolution
            layer; its length defines how many convolutions are stacked.
        pool_size: Forwarded to ``layers.pool2d`` as ``pool_size``.
        conv_padding: Padding of the convolutions. A value without
            ``__len__`` is shared by all convolutions, otherwise it is
            indexed per convolution.
        conv_filter_size: Filter size of the convolutions; shared or
            per-convolution like ``conv_padding``.
        conv_act: Activation of the convolutions. When batch normalization
            is enabled for a convolution, the activation is applied by the
            batch normalization layer instead of the convolution.
        param_attr: Parameter attribute of the convolutions; shared or
            per-convolution like ``conv_padding``.
        conv_with_batchnorm: Whether a convolution is followed by batch
            normalization; shared or per-convolution.
        conv_batchnorm_drop_rate: Dropout probability applied after batch
            normalization; shared or per-convolution. Dropout is skipped
            when the absolute rate is not greater than 1e-5, and is only
            considered for convolutions that use batch normalization.
        pool_stride: Forwarded to ``layers.pool2d`` as ``pool_stride``.
        pool_type: Forwarded to ``layers.pool2d`` as ``pool_type``.
        use_cudnn: Forwarded to the convolution and pooling layers.
        use_mkldnn: Forwarded to the convolution layers only.

    Returns:
        The output of the final pooling layer.

    Raises:
        AssertionError: If ``conv_num_filter`` is neither a list nor a tuple.
        IndexError: If a per-convolution argument has fewer entries than
            ``conv_num_filter``.
    """
    assert isinstance(conv_num_filter, (list, tuple))
    num_convs = len(conv_num_filter)

    def __extend_list__(name, obj):
        # Scalars (anything without __len__) are broadcast to every conv.
        if not hasattr(obj, '__len__'):
            return [obj] * num_convs
        # Fail before any layer is created instead of in the middle of the
        # loop below, where part of the group would already have been built.
        if len(obj) < num_convs:
            raise IndexError(
                "%s has %d entries but conv_num_filter has %d." %
                (name, len(obj), num_convs))
        return obj

    conv_padding = __extend_list__('conv_padding', conv_padding)
    conv_filter_size = __extend_list__('conv_filter_size', conv_filter_size)
    param_attr = __extend_list__('param_attr', param_attr)
    conv_with_batchnorm = __extend_list__('conv_with_batchnorm',
                                          conv_with_batchnorm)
    conv_batchnorm_drop_rate = __extend_list__('conv_batchnorm_drop_rate',
                                               conv_batchnorm_drop_rate)

    tmp = input
    for i, num_filter in enumerate(conv_num_filter):
        # With batch norm the activation moves from the conv to the
        # batch_norm layer, so the conv itself stays linear.
        local_conv_act = None if conv_with_batchnorm[i] else conv_act

        tmp = layers.conv2d(
            input=tmp,
            num_filters=num_filter,
            filter_size=conv_filter_size[i],
            padding=conv_padding[i],
            param_attr=param_attr[i],
            act=local_conv_act,
            use_cudnn=use_cudnn,
            use_mkldnn=use_mkldnn)

        if conv_with_batchnorm[i]:
            tmp = layers.batch_norm(input=tmp, act=conv_act)
            drop_rate = conv_batchnorm_drop_rate[i]
            if abs(drop_rate) > 1e-5:
                tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)

    pool_out = layers.pool2d(
        input=tmp,
        pool_size=pool_size,
        pool_type=pool_type,
        pool_stride=pool_stride,
        use_cudnn=use_cudnn)
    return pool_out
D
dzhwinter 已提交
112 113 114 115 116


def sequence_conv_pool(input,
                       num_filters,
                       filter_size,
                       param_attr=None,
                       act="sigmoid",
                       pool_type="max"):
    """
    Build a sequence convolution layer followed by a sequence pooling layer.

    Args:
        input: Input handed to ``layers.sequence_conv``.
        num_filters: Forwarded to ``layers.sequence_conv`` as
            ``num_filters``.
        filter_size: Forwarded to ``layers.sequence_conv`` as
            ``filter_size``.
        param_attr: Parameter attribute forwarded to
            ``layers.sequence_conv``.
        act: Activation of the convolution. Default is "sigmoid".
        pool_type: Pooling type forwarded to ``layers.sequence_pool``.
            Default is "max".

    Returns:
        The output of the sequence pooling layer.
    """
    convolved = layers.sequence_conv(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
        act=act)

    return layers.sequence_pool(input=convolved, pool_type=pool_type)
G
guosheng 已提交
129 130 131 132


def glu(input, dim=-1):
    r"""
    The gated linear unit, built from a split, a sigmoid activation and an
    elementwise multiplication. The input is split into two equal sized
    parts :math:`a` and :math:`b` along the given dimension, and the result
    is computed as:

        .. math::

            {GLU}(a, b)= a \otimes \sigma(b)

    Refer to `Language Modeling with Gated Convolutional Networks
    <https://arxiv.org/pdf/1612.08083.pdf>`_.

    Args:
        input (Variable): The input variable which is a Tensor or LoDTensor.
        dim (int): The dimension along which to split. If :math:`dim < 0`, the
            dimension to split along is :math:`rank(input) + dim`.

    Returns:
        Variable: The Tensor variable with half the size of input.

    Examples:
        .. code-block:: python

            # x is a Tensor variable with shape [3, 6, 9]
            fluid.nets.glu(input=x, dim=1)  # shape of output: [3, 3, 9]
    """
    # First half carries the values, second half drives the gate.
    value_half, gate_half = layers.split(input, num_or_sections=2, dim=dim)
    return layers.elementwise_mul(x=value_half, y=layers.sigmoid(x=gate_half))
164 165


Y
ying 已提交
166 167 168
def scaled_dot_product_attention(queries,
                                 keys,
                                 values,
                                 num_heads=1,
                                 dropout_rate=0.):
    r"""
    The (multi-head) scaled dot-product attention.

    Attention mechanism can be seen as mapping a query and a set of key-value
    pairs to an output. The output is computed as a weighted sum of the values,
    where the weight assigned to each value is computed by a compatibility
    function (dot-product here) of the query with the corresponding key.

    The dot-product attention can be implemented through (batch) matrix
    multiplication as follows:

        .. math::

            Attention(Q, K, V)= softmax(\frac{QK^\mathrm{T}}{\sqrt{d_k}})V

    where :math:`d_k` is the hidden size of the keys per attention head.

    Refer to `Attention Is All You Need
    <https://arxiv.org/pdf/1706.03762.pdf>`_.

    Args:

        queries (Variable): The input variable which should be a 3-D Tensor.
        keys (Variable): The input variable which should be a 3-D Tensor.
                         Its last dimension must match that of queries.
        values (Variable): The input variable which should be a 3-D Tensor.
                           Its second-to-last dimension must match that of
                           keys.
        num_heads (int): Head number to compute the scaled dot product
                         attention. Default value is 1.
        dropout_rate (float): The dropout rate to drop the attention weight.
                              Default value is 0, which disables dropout.

    Returns:

        Variable: A 3-D Tensor computed by multi-head scaled dot product
                  attention.

    Raises:

        ValueError: If input queries, keys, values are not 3-D Tensors, if
                    the hidden sizes of queries and keys differ, if the
                    sequence lengths of keys and values differ, or if the
                    hidden size of keys or values is not divisible by
                    num_heads.

    NOTE:
        1. When num_heads > 1, three linear projections are learned respectively
        to map input queries, keys and values into queries', keys' and values'.
        queries', keys' and values' have the same shapes with queries, keys
        and values.

        2. When num_heads == 1, scaled_dot_product_attention has no learnable
        parameters.

    Examples:
        .. code-block:: python

            # Suppose q, k, v are Tensors with the following shape:
            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]

            contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
            contexts.shape  # [3, 5, 10]
    """
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs quries, keys and values should all be 3-D tensors.")

    if queries.shape[-1] != keys.shape[-1]:
        raise ValueError(
            "The hidden size of queries and keys should be the same.")
    if keys.shape[-2] != values.shape[-2]:
        raise ValueError(
            "The max sequence length in key batch and in value batch "
            "should be the same.")
    if keys.shape[-1] % num_heads != 0:
        raise ValueError("The hidden size of keys (%d) must be divisible "
                         "by the number of attention heads (%d)." %
                         (keys.shape[-1], num_heads))
    if values.shape[-1] % num_heads != 0:
        raise ValueError("The hidden size of values (%d) must be divisible "
                         "by the number of attention heads (%d)." %
                         (values.shape[-1], num_heads))

    def __compute_qkv(queries, keys, values, num_heads):
        """
        Add linear projection to queries, keys, and values.

        Args:
            queries(Tensor): a 3-D input Tensor.
            keys(Tensor): a 3-D input Tensor.
            values(Tensor): a 3-D input Tensor.
            num_heads(int): The number of heads. Linearly project the inputs
                            ONLY when num_heads > 1.

        Returns:
            Tensor: linearly projected output Tensors: queries', keys' and
                    values'. They have the same shapes with queries, keys and
                    values.
        """

        if num_heads == 1:
            return queries, keys, values

        q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
        k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
        v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
        return q, k, v

    def __split_heads(x, num_heads):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions.

        Args:
            x(Tensor): a 3-D input Tensor.
            num_heads(int): The number of heads.

        Returns:
            Tensor: x itself when num_heads is 1, otherwise a 4-D Tensor with
                    shape [batch_size, num_heads, max_sequence_length,
                    m / num_heads], where m is size of the last dimension
                    of x.
        """
        if num_heads == 1:
            return x

        hidden_size = x.shape[-1]
        # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
        # into a 4-D output:
        # [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
        reshaped = layers.reshape(
            x=x,
            shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])

        # permute the dimensions into:
        # [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Reshape the last two dimensions of input tensor x so that it becomes
        one dimension.

        Args:
            x(Tensor): a 4-D input Tensor with shape
                       [bs, num_heads, max_sequence_length, hidden_dim].
                       A 3-D Tensor (single head) is returned unchanged.

        Returns:
            Tensor: a Tensor with shape
                    [bs, max_sequence_length, num_heads * hidden_dim].
        """

        if len(x.shape) == 3:
            return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # Build a real list (not a lazy map object) so the shape argument is
        # a concrete sequence of ints on both Python 2 and Python 3.
        merged_shape = [
            int(dim)
            for dim in (trans_x.shape[0], trans_x.shape[1],
                        trans_x.shape[2] * trans_x.shape[3])
        ]
        return layers.reshape(x=trans_x, shape=merged_shape)

    q, k, v = __compute_qkv(queries, keys, values, num_heads)

    q = __split_heads(q, num_heads)
    k = __split_heads(k, num_heads)
    v = __split_heads(v, num_heads)

    key_dim_per_head = keys.shape[-1] // num_heads
    scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
    # Q * K^T: [..., query_len, dim] x [..., key_len, dim]^T gives
    # [..., query_len, key_len], so each row holds one query's scores over
    # all keys and can later be multiplied with V ([..., key_len, value_dim]).
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

    # Flatten to 2-D so that softmax normalizes over the last (key) axis,
    # then restore the original shape.
    weights = layers.reshape(
        x=layers.reshape(
            x=product, shape=[-1, product.shape[-1]], act="softmax"),
        shape=product.shape)
    if dropout_rate:
        weights = layers.dropout(
            weights, dropout_prob=dropout_rate, is_test=False)
    ctx_multiheads = layers.matmul(weights, v)
    return __combine_heads(ctx_multiheads)