#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import layers
# Public names exported by a star-import of this module. Every public
# network builder defined in this file is listed here.
__all__ = [
    "simple_img_conv_pool",
    "img_conv_group",
    "sequence_conv_pool",
    "glu",
    "scaled_dot_product_attention",
]

def simple_img_conv_pool(input,
                         num_filters,
                         filter_size,
                         pool_size,
                         pool_stride,
                         act,
                         param_attr=None,
                         pool_type='max',
                         use_cudnn=True,
                         use_mkldnn=False):
    """
    Chain one ``layers.conv2d`` with one ``layers.pool2d``.

    Args:
        input: Fed to the convolution as its ``input``.
        num_filters: Forwarded to ``layers.conv2d`` as ``num_filters``.
        filter_size: Forwarded to ``layers.conv2d`` as ``filter_size``.
        pool_size: Forwarded to ``layers.pool2d`` as ``pool_size``.
        pool_stride: Forwarded to ``layers.pool2d`` as ``pool_stride``.
        act: Forwarded to ``layers.conv2d`` as ``act``.
        param_attr: Forwarded to ``layers.conv2d`` as ``param_attr``.
            Default value is None.
        pool_type: Forwarded to ``layers.pool2d`` as ``pool_type``.
            Default value is 'max'.
        use_cudnn: Forwarded to both layers. Default value is True.
        use_mkldnn: Forwarded to both layers. Default value is False.

    Returns:
        The result of ``layers.pool2d`` applied to the convolution output.
    """
    # Both layers receive the very same backend switches.
    backend_flags = dict(use_cudnn=use_cudnn, use_mkldnn=use_mkldnn)

    convolved = layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
        act=act,
        **backend_flags)

    return layers.pool2d(
        input=convolved,
        pool_size=pool_size,
        pool_type=pool_type,
        pool_stride=pool_stride,
        **backend_flags)


def img_conv_group(input,
                   conv_num_filter,
                   pool_size,
                   conv_padding=1,
                   conv_filter_size=3,
                   conv_act=None,
                   param_attr=None,
                   conv_with_batchnorm=False,
                   conv_batchnorm_drop_rate=0.0,
                   pool_stride=1,
                   pool_type=None,
                   use_cudnn=True,
                   use_mkldnn=False):
    """
    Image Convolution Group, used for VGG-style nets.

    Stacks ``len(conv_num_filter)`` ``layers.conv2d`` calls, each optionally
    followed by ``layers.batch_norm`` and ``layers.dropout``, and finishes
    with a single ``layers.pool2d``.

    Args:
        input: Input of the first convolution.
        conv_num_filter (list|tuple): ``num_filters`` of every convolution;
            its length sets the number of convolutions.
        pool_size: Forwarded to ``layers.pool2d`` as ``pool_size``.
        conv_padding: ``padding`` of the convolutions. A value without
            ``__len__`` is shared by all convolutions, anything else is
            indexed per convolution. Default value is 1.
        conv_filter_size: ``filter_size`` of the convolutions, shared or
            per convolution like ``conv_padding``. Default value is 3.
        conv_act: Activation. It is applied by the convolution itself, or by
            the batch norm layer when batch norm is enabled for that
            convolution. Default value is None.
        param_attr: ``param_attr`` of the convolutions, shared or per
            convolution like ``conv_padding``. Default value is None.
        conv_with_batchnorm: Whether a convolution is followed by batch
            norm, shared or per convolution. Default value is False.
        conv_batchnorm_drop_rate: Dropout probability applied after batch
            norm, shared or per convolution. Dropout is skipped when the
            absolute value is not greater than 1e-5. Default value is 0.0.
        pool_stride: Forwarded to ``layers.pool2d``. Default value is 1.
        pool_type: Forwarded unchanged to ``layers.pool2d``; the default
            None is passed through as-is.
        use_cudnn: Forwarded to every conv2d and to pool2d.
            Default value is True.
        use_mkldnn: Forwarded to every conv2d and to pool2d.
            Default value is False.

    Returns:
        The result of the final ``layers.pool2d``.

    Raises:
        AssertionError: If conv_num_filter is neither a list nor a tuple.
    """
    assert isinstance(conv_num_filter, (list, tuple))
    num_convs = len(conv_num_filter)

    def __extend_list__(obj):
        # A setting without a length is one value shared by all convolutions.
        if not hasattr(obj, '__len__'):
            return [obj] * num_convs
        return obj

    conv_padding = __extend_list__(conv_padding)
    conv_filter_size = __extend_list__(conv_filter_size)
    param_attr = __extend_list__(param_attr)
    conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
    conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)

    tmp = input
    # enumerate() instead of xrange(): identical iteration on Python 2 and
    # no NameError on Python 3. The per-convolution settings stay indexed so
    # that a too-short list still raises IndexError instead of being
    # silently truncated.
    for i, num_filters in enumerate(conv_num_filter):
        with_batchnorm = conv_with_batchnorm[i]

        tmp = layers.conv2d(
            input=tmp,
            num_filters=num_filters,
            filter_size=conv_filter_size[i],
            padding=conv_padding[i],
            param_attr=param_attr[i],
            # With batch norm the activation is deferred to the norm layer.
            act=None if with_batchnorm else conv_act,
            use_cudnn=use_cudnn,
            use_mkldnn=use_mkldnn)

        if with_batchnorm:
            tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True)
            drop_rate = conv_batchnorm_drop_rate[i]
            # Treat a (near-)zero rate as "no dropout".
            if abs(drop_rate) > 1e-5:
                tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)

    pool_out = layers.pool2d(
        input=tmp,
        pool_size=pool_size,
        pool_type=pool_type,
        pool_stride=pool_stride,
        use_cudnn=use_cudnn,
        use_mkldnn=use_mkldnn)
    return pool_out


def sequence_conv_pool(input,
                       num_filters,
                       filter_size,
                       param_attr=None,
                       act="sigmoid",
                       pool_type="max"):
    """
    Chain one ``layers.sequence_conv`` with one ``layers.sequence_pool``.

    Args:
        input: Fed to the sequence convolution as its ``input``.
        num_filters: Forwarded to ``layers.sequence_conv``.
        filter_size: Forwarded to ``layers.sequence_conv``.
        param_attr: Forwarded to ``layers.sequence_conv``.
            Default value is None.
        act: Forwarded to ``layers.sequence_conv``.
            Default value is "sigmoid".
        pool_type: Forwarded to ``layers.sequence_pool``.
            Default value is "max".

    Returns:
        The result of ``layers.sequence_pool`` applied to the convolution
        output.
    """
    conv_kwargs = dict(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
        act=act)
    convolved = layers.sequence_conv(**conv_kwargs)
    return layers.sequence_pool(input=convolved, pool_type=pool_type)


def glu(input, dim=-1):
    r"""
    The gated linear unit, built from split, sigmoid activation and
    elementwise multiplication. The input is split into two equally sized
    parts :math:`a` and :math:`b` along the given dimension, and the result
    is computed as:

        .. math::

            {GLU}(a, b)= a \otimes \sigma(b)

    Refer to `Language Modeling with Gated Convolutional Networks
    <https://arxiv.org/pdf/1612.08083.pdf>`_.

    Args:
        input (Variable): The input variable which is a Tensor or LoDTensor.
        dim (int): The dimension along which to split. If :math:`dim < 0`,
            the dimension to split along is :math:`rank(input) + dim`.

    Returns:
        Variable: The Tensor variable with half the size of input.

    Examples:
        .. code-block:: python

            # x is a Tensor variable with shape [3, 6, 9]
            fluid.nets.glu(input=x, dim=1)  # shape of output: [3, 3, 9]
    """
    # First half carries the values, second half drives the gate.
    value_half, gate_half = layers.split(input, num_or_sections=2, dim=dim)
    gate = layers.sigmoid(x=gate_half)
    return layers.elementwise_mul(x=value_half, y=gate)


def scaled_dot_product_attention(queries,
                                 keys,
                                 values,
                                 num_heads=1,
                                 dropout_rate=0.):
    r"""
    The (multi-head) scaled dot-product attention.

    Attention mechanism can be seen as mapping a query and a set of key-value
    pairs to an output. The output is computed as a weighted sum of the
    values, where the weight assigned to each value is computed by a
    compatibility function (dot-product here) of the query with the
    corresponding key.

    The dot-product attention can be implemented through (batch) matrix
    multiplication as follows:

        .. math::

            Attention(Q, K, V)= softmax(\frac{QK^\mathrm{T}}{\sqrt{d_k}})V

    where :math:`d_k` is the hidden size of the keys per attention head.

    Refer to `Attention Is All You Need
    <https://arxiv.org/pdf/1706.03762.pdf>`_.

    Args:
        queries (Variable): The input variable which should be a 3-D Tensor.
        keys (Variable): The input variable which should be a 3-D Tensor.
            Its last dimension must equal the last dimension of queries.
        values (Variable): The input variable which should be a 3-D Tensor.
            Its second to last dimension must equal that of keys.
        num_heads (int): Head number to compute the scaled dot product
                         attention. Default value is 1.
        dropout_rate (float): The dropout rate to drop the attention weight.
                              Default value is 0.

    Returns:
        Variable: A 3-D Tensor computed by multi-head scaled dot product
                  attention.

    Raises:
        ValueError: If input queries, keys, values are not 3-D Tensors, if
            the hidden sizes of queries and keys differ, if the sequence
            lengths of keys and values differ, or if the hidden size of keys
            or values is not divisible by num_heads.

    NOTE:
        1. When num_heads > 1, three linear projections are learned
        respectively to map input queries, keys and values into queries',
        keys' and values'. queries', keys' and values' have the same shapes
        with queries, keys and values.

        2. When num_heads == 1, scaled_dot_product_attention has no learnable
        parameters.

    Examples:
        .. code-block:: python

            # Suppose q, k, v are Tensors with the following shape:
            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]

            contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
            contexts.shape  # [3, 5, 10]
    """
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs quries, keys and values should all be 3-D tensors.")

    if queries.shape[-1] != keys.shape[-1]:
        raise ValueError(
            "The hidden size of queries and keys should be the same.")
    if keys.shape[-2] != values.shape[-2]:
        # This check compares keys with values, so the message names them.
        raise ValueError(
            "The max sequence length in key batch and in value batch "
            "should be the same.")
    if keys.shape[-1] % num_heads != 0:
        raise ValueError("The hidden size of keys (%d) must be divisible "
                         "by the number of attention heads (%d)." %
                         (keys.shape[-1], num_heads))
    if values.shape[-1] % num_heads != 0:
        raise ValueError("The hidden size of values (%d) must be divisible "
                         "by the number of attention heads (%d)." %
                         (values.shape[-1], num_heads))

    def __compute_qkv(queries, keys, values, num_heads):
        """
        Add linear projection to queries, keys, and values.

        Args:
            queries(Tensor): a 3-D input Tensor.
            keys(Tensor): a 3-D input Tensor.
            values(Tensor): a 3-D input Tensor.
            num_heads(int): The number of heads. Linearly project the inputs
                            ONLY when num_heads > 1.

        Returns:
            Tensor: linearly projected output Tensors: queries', keys' and
                    values'. They have the same shapes with queries, keys and
                    values.
        """
        if num_heads == 1:
            return queries, keys, values

        q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
        k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
        v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
        return q, k, v

    def __split_heads(x, num_heads):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions, then move the head dimension in front of the sequence
        dimension.

        Args:
            x(Tensor): a 3-D input Tensor.
            num_heads(int): The number of heads.

        Returns:
            Tensor: x itself when num_heads == 1; otherwise a 4-D Tensor
                    with shape [batch_size, num_heads, max_sequence_length,
                    m / num_heads], where m is the size of the last
                    dimension of x.
        """
        if num_heads == 1:
            return x

        hidden_size = x.shape[-1]
        # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
        # into a 4-D output:
        # [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
        reshaped = layers.reshape(
            x=x,
            shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])

        # permute the dimensions into:
        # [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Reshape the last two dimensions of input tensor x so that it becomes
        one dimension. A 3-D input (single head) is returned unchanged.

        Args:
            x(Tensor): a 4-D input Tensor with shape
                       [bs, num_heads, max_sequence_length, hidden_dim].

        Returns:
            Tensor: a Tensor with shape
                    [bs, max_sequence_length, num_heads * hidden_dim].

        Raises:
            ValueError: If x is neither a 3-D nor a 4-D Tensor.
        """
        if len(x.shape) == 3:
            return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # Build a real list: map() would be a one-shot iterator on Python 3.
        merged_shape = [
            int(dim)
            for dim in (trans_x.shape[0], trans_x.shape[1],
                        trans_x.shape[2] * trans_x.shape[3])
        ]
        return layers.reshape(x=trans_x, shape=merged_shape)

    q, k, v = __compute_qkv(queries, keys, values, num_heads)

    q = __split_heads(q, num_heads)
    k = __split_heads(k, num_heads)
    v = __split_heads(v, num_heads)

    key_dim_per_head = keys.shape[-1] // num_heads
    scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
    # Q * K^T: one row of scores per query position and one column per key
    # position, so that the weights can be multiplied with v below.
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

    # Flatten to 2-D so that the softmax runs over the last (key) dimension,
    # then restore the original shape.
    weights = layers.reshape(
        x=layers.reshape(
            x=product, shape=[-1, product.shape[-1]], act="softmax"),
        shape=product.shape)
    if dropout_rate:
        weights = layers.dropout(
            weights, dropout_prob=dropout_rate, is_test=False)
    ctx_multiheads = layers.matmul(weights, v)
    return __combine_heads(ctx_multiheads)